Merge branch 'master' of https://github.com/tensorflow/tensorflow into issue-s390-lite-2

This commit is contained in:
frreiss 2020-01-15 12:14:25 -08:00
commit 6661cf9524
4567 changed files with 207481 additions and 251040 deletions

View File

@@ -100,9 +100,9 @@ build --apple_platform_type=macos
# iOS configs for each architecture and the fat binary builds.
build:ios --apple_platform_type=ios
build:ios --apple_bitcode=embedded --copt=-fembed-bitcode
build:ios --copt=-Wno-c++11-narrowing
build:ios_armv7 --config=ios
build:ios_armv7 --cpu=ios_armv7
build:ios_armv7 --copt -Wno-c++11-narrowing
build:ios_arm64 --config=ios
build:ios_arm64 --cpu=ios_arm64
build:ios_i386 --config=ios
@@ -111,7 +111,6 @@ build:ios_x86_64 --config=ios
build:ios_x86_64 --cpu=ios_x86_64
build:ios_fat --config=ios
build:ios_fat --ios_multi_cpus=armv7,arm64,i386,x86_64
build:ios_fat --copt -Wno-c++11-narrowing
# Config to use a mostly-static build and disable modular op registration
# support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
@@ -202,18 +201,25 @@ build --define=allow_oversize_protos=true
build --spawn_strategy=standalone
build -c opt
# By default, build TF in C++ 14 mode.
build --cxxopt=-std=c++14
build --host_cxxopt=-std=c++14
# Make Bazel print out all options from rc files.
build --announce_rc
# Other build flags.
build --define=grpc_no_ares=true
# Prevent regression of https://github.com/bazelbuild/bazel/issues/7362
build --incompatible_remove_legacy_whole_archive
# See https://github.com/bazelbuild/bazel/issues/7362 for information on what
# --incompatible_remove_legacy_whole_archive flag does.
# This flag is set to true in Bazel 1.0 and newer versions. We tried to migrate
# TensorFlow to the default; however, test coverage wasn't enough to catch the
# errors.
# There is ongoing work on Bazel team's side to provide support for transitive
# shared libraries. As part of migrating to transitive shared libraries, we
# hope to provide a better mechanism for control over symbol exporting, and
# then tackle this issue again.
#
# TODO: Remove this line once TF doesn't depend on Bazel wrapping all library
# archives in -whole_archive -no_whole_archive.
build --noincompatible_remove_legacy_whole_archive
# Modular TF build options
build:dynamic_kernels --define=dynamic_loaded_kernels=true
@@ -224,13 +230,62 @@ build:c++17 --cxxopt=-std=c++1z
build:c++17 --cxxopt=-stdlib=libc++
build:c++1z --config=c++17
# Default paths for TF_SYSTEM_LIBS
build --define=PREFIX=/usr
build --define=LIBDIR=$(PREFIX)/lib
build --define=INCLUDEDIR=$(PREFIX)/include
# Enable using platform specific build settings
build --enable_platform_specific_config
# Suppress C++ compiler warnings, otherwise build logs become 10s of MBs.
build --copt=-w
build:linux --copt=-w
build:macos --copt=-w
build:windows --copt=/w
# Tensorflow uses M_* math constants that only get defined by MSVC headers if
# _USE_MATH_DEFINES is defined.
build:windows --copt=/D_USE_MATH_DEFINES
# Default paths for TF_SYSTEM_LIBS
build:linux --define=PREFIX=/usr
build:linux --define=LIBDIR=$(PREFIX)/lib
build:linux --define=INCLUDEDIR=$(PREFIX)/include
build:macos --define=PREFIX=/usr
build:macos --define=LIBDIR=$(PREFIX)/lib
build:macos --define=INCLUDEDIR=$(PREFIX)/include
# TF_SYSTEM_LIBS do not work on windows.
# By default, build TF in C++ 14 mode.
build:linux --cxxopt=-std=c++14
build:linux --host_cxxopt=-std=c++14
build:macos --cxxopt=-std=c++14
build:macos --host_cxxopt=-std=c++14
build:windows --cxxopt=/std:c++14
build:windows --host_cxxopt=/std:c++14
# On windows, we still link everything into a single DLL.
build:windows --config=monolithic
# On linux, we dynamically link small amount of kernels
build:linux --config=dynamic_kernels
# Make sure to include as little of windows.h as possible
build:windows --copt=-DWIN32_LEAN_AND_MEAN
build:windows --host_copt=-DWIN32_LEAN_AND_MEAN
build:windows --copt=-DNOGDI
build:windows --host_copt=-DNOGDI
# Misc build options we need for windows.
build:windows --linkopt=/DEBUG
build:windows --host_linkopt=/DEBUG
build:windows --linkopt=/OPT:REF
build:windows --host_linkopt=/OPT:REF
build:windows --linkopt=/OPT:ICF
build:windows --host_linkopt=/OPT:ICF
build:windows --experimental_strict_action_env=true
build:windows --incompatible_windows_native_test_wrapper
# Verbose failure logs when something goes wrong
build:windows --verbose_failures
# On windows, we never cross compile
build:windows --distinct_host_configuration=false
# Suppress all warning messages.
build:short_logs --output_filter=DONT_MATCH_ANYTHING
@@ -326,29 +381,15 @@ build:rbe_linux_py3 --python_path="/usr/bin/python3"
build:rbe_linux_py3 --repo_env=TF_PYTHON_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3"
build:rbe_win --config=rbe
build:rbe_win --crosstool_top="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_026:toolchain"
build:rbe_win --crosstool_top="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_121:toolchain"
build:rbe_win --extra_execution_platforms="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --extra_toolchains="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_026:cc-toolchain-x64_windows"
build:rbe_win --extra_toolchains="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_121:cc-toolchain-x64_windows"
build:rbe_win --host_javabase="@org_tensorflow//third_party/toolchains/preconfig/win_1803:windows_jdk8"
build:rbe_win --host_platform="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --javabase="@org_tensorflow//third_party/toolchains/preconfig/win_1803:windows_jdk8"
build:rbe_win --platforms="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --shell_executable=C:\\tools\\msys64\\usr\\bin\\bash.exe
# Misc build options we need for windows
build:rbe_win --copt=-DWIN32_LEAN_AND_MEAN
build:rbe_win --host_copt=-DWIN32_LEAN_AND_MEAN
build:rbe_win --copt=-DNOGDI
build:rbe_win --host_copt=-DNOGDI
build:rbe_win --linkopt=/DEBUG
build:rbe_win --host_linkopt=/DEBUG
build:rbe_win --linkopt=/OPT:REF
build:rbe_win --host_linkopt=/OPT:REF
build:rbe_win --linkopt=/OPT:ICF
build:rbe_win --host_linkopt=/OPT:ICF
build:rbe_win --config=monolithic
build:rbe_win --experimental_strict_action_env=true
build:rbe_win --incompatible_windows_native_test_wrapper
# TODO(gunan): Remove once we use MSVC 2019 with latest patches.
build:rbe_win --define=override_eigen_strong_inline=true

View File

@@ -1 +1 @@
1.1.0
1.2.1

View File

@@ -0,0 +1,19 @@
---
name: TensorFlow Lite for Microcontrollers Issue
about: Use this template for reporting issues with TensorFlow Lite for microcontrollers
labels: 'comp:micro'
---
@tensorflow/micro
**System information**
- Host OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- TensorFlow installed from (source or binary):
- TensorFlow version (commit SHA if source):
- Target platform (e.g. Arm Mbed OS, Arduino Nano 33 etc.):
**Describe the problem**
**Please provide the exact sequence of commands/steps when you ran into the problem**

View File

@@ -13,55 +13,4 @@
/tensorflow/tensorboard/ @jart
/tensorflow/tools/docs/ @markdaoust
# contrib
# NEED OWNER: /tensorflow/contrib/all_reduce
/tensorflow/contrib/autograph/ @mdanatg @kkimdev
/tensorflow/contrib/batching/ @alextp @chrisolston
/tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon
/tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva
/tensorflow/contrib/checkpoint/ @allenlavoie
/tensorflow/contrib/contrib/cluster_resolver/ @frankchn
/tensorflow/contrib/cmake/ @mrry
/tensorflow/contrib/copy_graph/ @tucker @poxvoculi
/tensorflow/contrib/crf/ @kentonl
/tensorflow/contrib/data/ @mrry
/tensorflow/tensorflow/contrib/distribute @joshl @priyag @sourabhbajaj @frankchn
/tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi
/tensorflow/contrib/eager @jaingaurav @alextp
/tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo
/tensorflow/contrib/ffmpeg/ @fredbertsch
/tensorflow/contrib/framework/ @ebrevdo
/tensorflow/contrib/graph_editor/ @purpledog
# NEED OWNER: /tensorflow/contrib/grid_rnn/
/tensorflow/contrib/hadoop @yongtang
/tensorflow/contrib/hvx/ @satok16
/tensorflow/contrib/integrate/ @shoyer
/tensorflow/contrib/kernel_methods/ @petrosmol
/tensorflow/contrib/ios_examples/ @petewarden
/tensorflow/contrib/labeled_tensor/ @shoyer
/tensorflow/contrib/layers/ @fchollet @martinwicke
/tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp
/tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis
/tensorflow/contrib/lookup/ @ysuematsu @andreasst
/tensorflow/contrib/losses/ @alextp @ispirmustafa
/tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg
/tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa
/tensorflow/contrib/opt/ @strategist333 @alextp
/tensorflow/contrib/pi_examples/ @maciekcc
/tensorflow/contrib/quantization/ @petewarden
/tensorflow/contrib/rnn/ @ebrevdo @scottzhu
/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenlavoie
/tensorflow/contrib/seq2seq/ @ebrevdo @lmthang
/tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
/tensorflow/contrib/slim/ @sguada @thenbasilmanran
/tensorflow/contrib/stateless/ @girving @alextp
/tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
/tensorflow/contrib/tensorrt/ @aaroey @smit-hinsu @azaks2
# NEED OWNER: /tensorflow/contrib/testing/
/tensorflow/contrib/timeseries/ @allenlavoie
/tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj
/tensorflow/contrib/training/ @joel-shor @ebrevdo
/tensorflow/contrib/util/ @sherrym
/third_party/systemlibs/ @perfinion

View File

@@ -72,7 +72,7 @@ TensorFlow coding style.
[tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core)
and
[tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
TensorFlow has reached version 1 and hence cannot make
TensorFlow has passed version 1.0 and hence cannot make
non-backward-compatible API changes without a major release. Reviewers of
your pull request will comment on any API compatibility issues.
* When you contribute a new feature to TensorFlow, the maintenance burden is

View File

@@ -37,23 +37,26 @@ See the [TensorFlow install guide](https://www.tensorflow.org/install) for the
[Docker container](https://www.tensorflow.org/install/docker), and
[build from source](https://www.tensorflow.org/install/source).
To install the current release for CPU-only:
To install the current release, which includes support for
[CUDA-enabled GPU cards](https://www.tensorflow.org/install/gpu) *(Ubuntu and
Windows)*:
```
$ pip install tensorflow
```
Use the GPU package for
[CUDA-enabled GPU cards](https://www.tensorflow.org/install/gpu) *(Ubuntu and
Windows)*:
A smaller CPU-only package is also available:
```
$ pip install tensorflow-gpu
$ pip install tensorflow-cpu
```
To update TensorFlow to the latest version, add `--upgrade` flag to the above
commands.
*Nightly binaries are available for testing using the
[tf-nightly](https://pypi.python.org/pypi/tf-nightly) and
[tf-nightly-gpu](https://pypi.python.org/pypi/tf-nightly-gpu) packages on PyPi.*
[tf-nightly-cpu](https://pypi.python.org/pypi/tf-nightly-cpu) packages on PyPi.*
#### *Try your first TensorFlow program*
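The program itself falls outside this diff hunk; as an illustration, a minimal first-program sketch (assuming a TensorFlow 2.x installation, not the snippet from the README itself):

```python
import tensorflow as tf

# Eager execution is enabled by default in TF 2.x, so ops run immediately.
x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
print(tf.reduce_sum(x).numpy())  # 10.0
```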
@@ -110,19 +113,19 @@ Build Type | Status
### Community Supported Builds
Build Type | Status | Artifacts
------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/)
**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | Release [1.15](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/) / [2.x](http://ml-ci.amd.com:21096/job/tensorflow-rocm-v2-release/lastSuccessfulBuild/)
**Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)
**Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/)
**Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
**Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/)
**Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
**Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/)
**Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
**Linux CPU with Intel® MKL-DNN** <br> **Supports Python 2.7, 3.4, 3.5, 3.6 and 3.7** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.14.0 PyPI](https://pypi.org/project/intel-tensorflow/)
**Red Hat® Enterprise Linux® 7.6 CPU & GPU** <br> Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/)
Build Type | Status | Artifacts
----------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/)
**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | Release [1.15](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/) / [2.x](http://ml-ci.amd.com:21096/job/tensorflow-rocm-v2-release/lastSuccessfulBuild/)
**Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)
**Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/)
**Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
**Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/)
**Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
**Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/)
**Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
**Linux CPU with Intel® MKL-DNN** Stable Release | ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/)
**Red Hat® Enterprise Linux® 7.6 CPU & GPU** <br> Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/)
## Resources

View File

@@ -1,3 +1,106 @@
# Release 2.1.0
TensorFlow 2.1 will be the last TF release supporting Python 2. Python 2 support [officially ends on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). [As announced earlier](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ), TensorFlow will also stop supporting Python 2 starting January 1, 2020, and no more releases are expected in 2019.
## Major Features and Improvements
* The `tensorflow` pip package now includes GPU support by default (same as `tensorflow-gpu`) for both Linux and Windows. This runs on machines with and without NVIDIA GPUs. `tensorflow-gpu` is still available, and CPU-only packages can be downloaded at `tensorflow-cpu` for users who are concerned about package size.
* **Windows users:** Officially-released `tensorflow` Pip packages are now built with Visual Studio 2019 version 16.4 in order to take advantage of the new `/d2ReducedOptimizeHugeFunctions` compiler flag. To use these new packages, you must install "Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017 and 2019", available from Microsoft's website [here](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads).
* This does not change the minimum required version for building TensorFlow from source on Windows, but builds enabling `EIGEN_STRONG_INLINE` can take over 48 hours to compile without this flag. Refer to `configure.py` for more information about `EIGEN_STRONG_INLINE` and `/d2ReducedOptimizeHugeFunctions`.
* If either of the required DLLs, `msvcp140.dll` (old) or `msvcp140_1.dll` (new), is missing on your machine, `import tensorflow` will print a warning message.
* The `tensorflow` pip package is built with CUDA 10.1 and cuDNN 7.6.
* `tf.keras`
* Experimental support for mixed precision is available on GPUs and Cloud TPUs. See [usage guide](https://www.tensorflow.org/guide/keras/mixed_precision).
* Introduced the `TextVectorization` layer, which takes as input raw strings and takes care of text standardization, tokenization, n-gram generation, and vocabulary indexing. See this [end-to-end text classification example](https://colab.research.google.com/drive/1RvCnR7h0_l4Ekn5vINWToI9TNJdpUZB3).
* Keras `.compile`, `.fit`, `.evaluate`, and `.predict` are allowed to be outside of the DistributionStrategy scope, as long as the model was constructed inside of a scope.
* Experimental support for Keras `.compile`, `.fit`, `.evaluate`, and `.predict` is available for Cloud TPUs and Cloud TPU pods, for all types of Keras models (sequential, functional, and subclassing models).
* Automatic outside compilation is now enabled for Cloud TPUs. This allows `tf.summary` to be used more conveniently with Cloud TPUs.
* Dynamic batch sizes with DistributionStrategy and Keras are supported on Cloud TPUs.
* Support for `.fit`, `.evaluate`, `.predict` on TPU using numpy data, in addition to `tf.data.Dataset`.
* Keras reference implementations for many popular models are available in the TensorFlow [Model Garden](https://github.com/tensorflow/models/tree/master/official).
* `tf.data`
* Changes rebatching for `tf.data` datasets + DistributionStrategy for better performance. Note that the dataset also behaves slightly differently, in that the rebatched dataset cardinality will always be a multiple of the number of replicas.
* `tf.data.Dataset` now supports automatic data distribution and sharding in distributed environments, including on TPU pods.
* Distribution policies for `tf.data.Dataset` can now be tuned with 1. `tf.data.experimental.AutoShardPolicy(OFF, AUTO, FILE, DATA)` 2. `tf.data.experimental.ExternalStatePolicy(WARN, IGNORE, FAIL)`
* `tf.debugging`
* Add `tf.debugging.enable_check_numerics()` and `tf.debugging.disable_check_numerics()` to help debugging the root causes of issues involving infinities and `NaN`s.
* `tf.distribute`
* Custom training loop support on TPUs and TPU pods is available through `strategy.experimental_distribute_dataset`, `strategy.experimental_distribute_datasets_from_function`, `strategy.experimental_run_v2`, `strategy.reduce`.
* Support for a global distribution strategy through `tf.distribute.experimental_set_strategy()`, in addition to `strategy.scope()`.
* `TensorRT`
* [TensorRT 6.0](https://developer.nvidia.com/tensorrt#tensorrt-whats-new) is now supported and enabled by default. This adds support for more TensorFlow ops including Conv3D, Conv3DBackpropInputV2, AvgPool3D, MaxPool3D, ResizeBilinear, and ResizeNearestNeighbor. In addition, the TensorFlow-TensorRT python conversion API is exported as `tf.experimental.tensorrt.Converter`.
* Environment variable `TF_DETERMINISTIC_OPS` has been added. When set to "true" or "1", this environment variable makes `tf.nn.bias_add` operate deterministically (i.e. reproducibly), but currently only when XLA JIT compilation is *not* enabled. Setting `TF_DETERMINISTIC_OPS` to "true" or "1" also makes cuDNN convolution and max-pooling operate deterministically. This makes Keras Conv\*D and MaxPool\*D layers operate deterministically in both the forward and backward directions when running on a CUDA-enabled GPU.
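As a rough illustration of the `TF_DETERMINISTIC_OPS` switch described in the last bullet above, a minimal sketch (the reproducibility guarantee only applies under the conditions listed there, e.g. with XLA JIT compilation not enabled):

```python
import os

# Request deterministic (reproducible) op behavior; the variable must be set
# before the affected TensorFlow ops first run.
os.environ["TF_DETERMINISTIC_OPS"] = "1"

import tensorflow as tf

# With the flag set, tf.nn.bias_add (and cuDNN convolution/max-pooling on a
# CUDA-enabled GPU) is expected to produce bit-identical results across runs.
x = tf.random.normal([4, 8], seed=42)
b = tf.random.normal([8], seed=7)
print(tf.nn.bias_add(x, b).numpy())
```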
## Breaking Changes
* Deletes `Operation.traceback_with_start_lines` for which we know of no usages.
* Removed `id` from `tf.Tensor.__repr__()` as `id` is not useful other than internal debugging.
* Some `tf.assert_*` methods now raise assertions at operation creation time if the input tensors' values are known at that time, not during the `session.run()`. This only changes behavior when the graph execution would have resulted in an error. When this happens, a noop is returned and the input tensors are marked non-feedable. In other words, if they are used as keys in `feed_dict` argument to `session.run()`, an error will be raised. Also, because some assert ops don't make it into the graph, the graph structure changes. A different graph can result in different per-op random seeds when they are not given explicitly (most often).
* The following APIs are no longer experimental: `tf.config.list_logical_devices`, `tf.config.list_physical_devices`, `tf.config.get_visible_devices`, `tf.config.set_visible_devices`, `tf.config.get_logical_device_configuration`, `tf.config.set_logical_device_configuration`.
* `tf.config.experimental.VirtualDeviceConfiguration` has been renamed to `tf.config.LogicalDeviceConfiguration`.
* `tf.config.experimental_list_devices` has been removed; please use `tf.config.list_logical_devices`.
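To make the device-configuration changes above concrete, a small sketch (assuming TF 2.1; the logical-device configuration has to run before the TensorFlow runtime initializes):

```python
import tensorflow as tf

# Physical devices can be inspected before the runtime is initialized.
cpus = tf.config.list_physical_devices("CPU")

# Formerly tf.config.experimental.VirtualDeviceConfiguration: split one
# physical CPU into two logical devices.
tf.config.set_logical_device_configuration(
    cpus[0],
    [tf.config.LogicalDeviceConfiguration(),
     tf.config.LogicalDeviceConfiguration()])

# tf.config.experimental_list_devices was removed; this is its replacement.
print(tf.config.list_logical_devices("CPU"))  # two logical CPU devices
```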
## Bug Fixes and Other Changes
* `tf.data`
* Fixes concurrency issue with `tf.data.experimental.parallel_interleave` with `sloppy=True`.
* Add `tf.data.experimental.dense_to_ragged_batch()`.
* Extend `tf.data` parsing ops to support `RaggedTensors`.
* `tf.distribute`
* Fix issue where GRU would crash or give incorrect output when a `tf.distribute.Strategy` was used.
* `tf.estimator`
* Added option in `tf.estimator.CheckpointSaverHook` to not save the `GraphDef`.
* Moving the checkpoint reader from swig to pybind11.
* `tf.keras`
* Export `depthwise_conv2d` in `tf.keras.backend`.
* In Keras Layers and Models, Variables in `trainable_weights`, `non_trainable_weights`, and `weights` are explicitly deduplicated.
* Keras `model.load_weights` now accepts `skip_mismatch` as an argument. This was available in external Keras, and has now been copied over to `tf.keras`.
* Fix the input shape caching behavior of Keras convolutional layers.
* `Model.fit_generator`, `Model.evaluate_generator`, `Model.predict_generator`, `Model.train_on_batch`, `Model.test_on_batch`, and `Model.predict_on_batch` methods now respect the `run_eagerly` property, and will correctly run using `tf.function` by default. Note that `Model.fit_generator`, `Model.evaluate_generator`, and `Model.predict_generator` are deprecated endpoints. They are subsumed by `Model.fit`, `Model.evaluate`, and `Model.predict` which now support generators and Sequences.
* `tf.lite`
* Legalization for `NMS` ops in TFLite.
* add `narrow_range` and `axis` to `quantize_v2` and `dequantize` ops.
* Added support for `FusedBatchNormV3` in converter.
* Add an `errno`-like field to `NNAPI` delegate for detecting `NNAPI` errors for fallback behaviour.
* Refactors `NNAPI` Delegate to support detailed reason why an operation is not accelerated.
* Converts hardswish subgraphs into atomic ops.
* Other
* Critical stability updates for TPUs, especially in cases where the XLA compiler produces compilation errors.
* TPUs can now be re-initialized multiple times, using `tf.tpu.experimental.initialize_tpu_system`.
* Add `RaggedTensor.merge_dims()`.
* Added new `uniform_row_length` row-partitioning tensor to `RaggedTensor`.
* Add `shape` arg to `RaggedTensor.to_tensor`; Improve speed of `RaggedTensor.to_tensor`.
* `tf.io.parse_sequence_example` and `tf.io.parse_single_sequence_example` now support ragged features.
* Fix `while_v2` with variables in custom gradient.
* Support taking gradients of V2 `tf.cond` and `tf.while_loop` using `LookupTable`.
* Fix bug where `vectorized_map` failed on inputs with unknown static shape.
* Add preliminary support for sparse CSR matrices.
* Tensor equality with `None` now behaves as expected.
* Make calls to `tf.function(f)()`, `tf.function(f).get_concrete_function` and `tf.function(f).get_initialization_function` thread-safe.
* Extend `tf.identity` to work with CompositeTensors (such as SparseTensor)
* Added more `dtypes` and zero-sized inputs to `Einsum` Op and improved its performance
* Enable multi-worker `NCCL` `all-reduce` inside functions executing eagerly.
* Added complex128 support to `RFFT`, `RFFT2D`, `RFFT3D`, `IRFFT`, `IRFFT2D`, and `IRFFT3D`.
* Add `pfor` converter for `SelfAdjointEigV2`.
* Add `tf.math.ndtri` and `tf.math.erfinv`.
* Add `tf.config.experimental.enable_mlir_bridge` to allow using MLIR compiler bridge in eager mode.
* Added support for MatrixSolve on Cloud TPU / XLA.
* Added `tf.autodiff.ForwardAccumulator` for forward-mode autodiff (see the sketch after this list).
* Add `LinearOperatorPermutation`.
* A few performance optimizations on `tf.reduce_logsumexp`.
* Added multilabel handling to `AUC` metric
* Optimization on `zeros_like`.
* Dimension constructor now requires `None` or types with an `__index__` method.
* Add `tf.random.uniform` microbenchmark.
* Use `_protogen` suffix for proto library targets instead of `_cc_protogen` suffix.
* Moving the checkpoint reader from `swig` to `pybind11`.
* `tf.device` & `MirroredStrategy` now support passing in a `tf.config.LogicalDevice`
* If you're building Tensorflow from source, consider using [bazelisk](https://github.com/bazelbuild/bazelisk) to automatically download and use the correct Bazel version. Bazelisk reads the `.bazelversion` file at the root of the project directory.
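As referenced in the `tf.autodiff.ForwardAccumulator` bullet above, a minimal forward-mode autodiff sketch (assuming TF 2.1 with eager execution):

```python
import tensorflow as tf

# Forward-mode autodiff: push a tangent vector through the computation to get
# a Jacobian-vector product without materializing the full Jacobian.
primal = tf.constant([1.0, 2.0, 3.0])
tangent = tf.constant([1.0, 0.0, 0.0])  # direction to differentiate along

with tf.autodiff.ForwardAccumulator(primals=primal, tangents=tangent) as acc:
    y = tf.reduce_sum(tf.square(primal))

print(y.numpy())           # 14.0
print(acc.jvp(y).numpy())  # d/dx0 of sum(x**2) at x0 = 1.0 -> 2.0
```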
## Thanks to our Contributors
This release contains contributions from many people at Google, as well as:
8bitmp3, Aaron Ma, AbdüLhamit Yilmaz, Abhai Kollara, aflc, Ag Ramesh, Albert Z. Guo, Alex Torres, amoitra, Andrii Prymostka, angeliand, Anshuman Tripathy, Anthony Barbier, Anton Kachatkou, Anubh-V, Anuja Jakhade, Artem Ryabov, autoih, Bairen Yi, Bas Aarts, Basit Ayantunde, Ben Barsdell, Bhavani Subramanian, Brett Koonce, candy.dc, Captain-Pool, caster, cathy, Chong Yan, Choong Yin Thong, Clayne Robison, Colle, Dan Ganea, David Norman, David Refaeli, dengziming, Diego Caballero, Divyanshu, djshen, Douman, Duncan Riach, EFanZh, Elena Zhelezina, Eric Schweitz, Evgenii Zheltonozhskii, Fei Hu, fo40225, Fred Reiss, Frederic Bastien, Fredrik Knutsson, fsx950223, fwcore, George Grzegorz Pawelczak, George Sterpu, Gian Marco Iodice, Giorgio Arena, giuros01, Gomathi Ramamurthy, Guozhong Zhuang, Haifeng Jin, Haoyu Wu, HarikrishnanBalagopal, HJYOO, Huang Chen-Yi, Ilham Firdausi Putra, Imran Salam, Jared Nielsen, Jason Zaman, Jasper Vicenti, Jeff Daily, Jeff Poznanovic, Jens Elofsson, Jerry Shih, jerryyin, Jesper Dramsch, jim.meyer, Jongwon Lee, Jun Wan, Junyuan Xie, Kaixi Hou, kamalkraj, Kan Chen, Karthik Muthuraman, Keiji Ariyama, Kevin Rose, Kevin Wang, Koan-Sin Tan, kstuedem, Kwabena W. Agyeman, Lakshay Tokas, latyas, Leslie-Fang-Intel, Li, Guizi, Luciano Resende, Lukas Folle, Lukas Geiger, Mahmoud Abuzaina, Manuel Freiberger, Mark Ryan, Martin Mlostek, Masaki Kozuki, Matthew Bentham, Matthew Denton, mbhuiyan, mdfaijul, Muhwan Kim, Nagy Mostafa, nammbash, Nathan Luehr, Nathan Wells, Niranjan Hasabnis, Oleksii Volkovskyi, Olivier Moindrot, olramde, Ouyang Jin, OverLordGoldDragon, Pallavi G, Paul Andrey, Paul Wais, pkanwar23, Pooya Davoodi, Prabindh Sundareson, Rajeshwar Reddy T, Ralovich, Kristof, Refraction-Ray, Richard Barnes, richardbrks, Robert Herbig, Romeo Kienzler, Ryan Mccormick, saishruthi, Saket Khandelwal, Sami Kama, Sana Damani, Satoshi Tanaka, Sergey Mironov, Sergii Khomenko, Shahid, Shawn Presser, ShengYang1, Siddhartha Bagaria, Simon Plovyt, skeydan, srinivasan.narayanamoorthy, Stephen Mugisha, sunway513, Takeshi Watanabe, Taylor Jakobson, TengLu, TheMindVirus, ThisIsIsaac, Tim Gates, Timothy Liu, Tomer Gafner, Trent Lo, Trevor Hickey, Trevor Morris, vcarpani, Wei Wang, Wen-Heng (Jack) Chung, wenshuai, Wenshuai-Xiaomi, wenxizhu, william, William D. Irons, Xinan Jiang, Yannic, Yasir Modak, Yasuhiro Matsumoto, Yong Tang, Yongfeng Gu, Youwei Song, Zaccharie Ramzi, Zhang, Zhenyu Guo, 王振华 (Zhenhua Wang), 韩董, 이중건 Isaac Lee
# Release 1.15.0
This is the last 1.x release for TensorFlow. We do not expect to update the 1.x branch with features, although we will issue patch releases to fix vulnerabilities for at least one year.
@@ -201,239 +304,387 @@ If you experience any snags when using TF 2.0, please let us know at the [TF 2.0
## Bug Fixes and Other Changes
* `tf.contrib`:
* Expose `tf.contrib.proto.*` ops in `tf.io` (they will exist in TF2)
* `tf.data`:
* Add support for TensorArrays to `tf.data Dataset`.
* Integrate Ragged Tensors with `tf.data`.
* All core and experimental tf.data transformations that input user-defined functions can span multiple devices now.
* Extending the TF 2.0 support for `shuffle(..., reshuffle_each_iteration=True)` and `cache()` to work across different Python iterators for the same dataset.
* Removing the `experimental_numa_aware` option from `tf.data.Options`.
* Add `num_parallel_reads` and passing in a Dataset containing filenames into `TextLineDataset` and `FixedLengthRecordDataset`.
* Add support for defaulting the value of `cycle_length` argument of `tf.data.Dataset.interleave` to the number of schedulable CPU cores.
* Promoting `tf.data.experimental.enumerate_dataset` to core as `tf.data.Dataset.enumerate`.
* Promoting `tf.data.experimental.unbatch` to core as `tf.data.Dataset.unbatch`.
* Adds option for introducing slack in the pipeline to reduce CPU contention, via `tf.data.Options().experimental_slack = True`
* Added experimental support for parallel batching to `batch()` and `padded_batch()`. This functionality can be enabled through `tf.data.Options()`.
* Support cancellation of long-running `reduce`.
* Now we use `dataset` node name as prefix instead of the op name, to identify the component correctly in metrics, for pipelines with repeated components.
* Improve the performance of datasets using `from_tensors()`.
* Promoting `unbatch` from experimental to core API.
* Adding support for datasets as inputs to `from_tensors` and `from_tensor_slices` and batching and unbatching of nested datasets.
* `tf.contrib`:
* `tf.distribute`:
* Enable `tf.distribute.experimental.MultiWorkerMirroredStrategy` working in eager mode.
* Callbacks are supported in `MultiWorkerMirroredStrategy`.
* Disable `run_eagerly` and distribution strategy if there are symbolic tensors added to the model using `add_metric` or `add_loss`.
* Loss and gradients should now more reliably be correctly scaled w.r.t. the global batch size when using a `tf.distribute.Strategy`.
* Set default loss reduction as `AUTO` for improving reliability of loss scaling with distribution strategy and custom training loops. `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When used in distribution strategy scope, outside of built-in training loops such as `tf.keras` `compile` and `fit`, we expect reduction value to be 'None' or 'SUM'. Using other values will raise an error.
* Support for multi-host `ncclAllReduce` in Distribution Strategy.
* Expose `tf.contrib.proto.*` ops in `tf.io` (they will exist in TF2)
* `tf.estimator`:
* Replace `tf.contrib.estimator.add_metrics` with `tf.estimator.add_metrics`
* Use `tf.compat.v1.estimator.inputs` instead of `tf.estimator.inputs`
* Replace contrib references with `tf.estimator.experimental.*` for apis in early_s in Estimator
* Canned Estimators will now use keras optimizers by default. An error will be raised if tf.train.Optimizers are used, and you will have to switch to tf.keras.optimizers or tf.compat.v1 canned Estimators.
* A checkpoint converter for canned Estimators has been provided to transition canned Estimators that are warm started from `tf.train.Optimizers` to `tf.keras.optimizers`.
* Losses are scaled in canned estimator v2 and not in the optimizers anymore. If you are using Estimator + distribution strategy + optimizer v1 then the behavior does not change. This implies that if you are using custom estimator with optimizer v2, you have to scale losses. We have new utilities to help scale losses `tf.nn.compute_average_loss`, `tf.nn.scale_regularization_loss`.
* `tf.data`:
* `tf.keras`:
* Premade models (including Linear and WideDeep) have been introduced for the purpose of replacing Premade estimators.
* Model saving changes
* `model.save` and `tf.saved_model.save` may now save to the TensorFlow SavedModel format. The model can be restored using `tf.keras.models.load_model`. HDF5 files are still supported, and may be used by specifying `save_format="h5"` when saving.
* Raw TensorFlow functions can now be used in conjunction with the Keras Functional API during model creation. This obviates the need for users to create Lambda layers in most cases when using the Functional API. Like Lambda layers, TensorFlow functions that result in Variable creation or assign ops are not supported.
* Add support for passing list of lists to the `metrics` argument in Keras `compile`.
* Add `tf.keras.layers.AbstractRNNCell` as the preferred implementation for RNN cells in TF v2. User can use it to implement RNN cells with custom behavior.
* Keras training and validation curves are shown on the same plot when using the TensorBoard callback.
* Switched Keras `fit/evaluate/predict` execution to use only a single unified path by default unless eager execution has been explicitly disabled, regardless of input type. This unified path places an eager-friendly training step inside of a `tf.function`. With this
1. All input types are converted to `Dataset`.
2. The path assumes there is always a distribution strategy. When distribution strategy is not specified the path uses a no-op distribution strategy.
3. The training step is wrapped in `tf.function` unless `run_eagerly=True` is set in compile. The single path execution code does not yet support all use cases. We fallback to the existing v1 execution paths if your model contains the following:
1. `sample_weight_mode` in compile
2. `weighted_metrics` in compile
3. v1 optimizer
4. target tensors in compile
If you are experiencing any issues because of this change, please inform us (file an issue) about your use case and you can unblock yourself by setting `experimental_run_tf_function=False` in compile meanwhile. We have seen a couple of use cases where the model usage pattern is not as expected and would not work with this change.
1. output tensors of one layer are used in the constructor of another.
2. symbolic tensors outside the scope of the model are used in custom loss functions.
The flag can be disabled for these cases and ideally the usage pattern will need to be fixed.
* Mark Keras `set_session` as `compat.v1` only.
* `tf.keras.estimator.model_to_estimator` now supports exporting to `tf.train.Checkpoint format`, which allows the saved checkpoints to be compatible with `model.load_weights`.
* `keras.backend.resize_images` (and consequently, `keras.layers.Upsampling2D`) behavior has changed, a bug in the resizing implementation was fixed.
* Add an `implementation=3` mode for `tf.keras.layers.LocallyConnected2D` and `tf.keras.layers.LocallyConnected1D` layers using `tf.SparseTensor` to store weights, allowing a dramatic speedup for large sparse models.
* Raise error if `batch_size` argument is used when input is dataset/generator/keras sequence.
* Update TF 2.0 `keras.backend.name_scope` to use TF 2.0 `name_scope`.
* Add v2 module aliases for losses, metrics, initializers and optimizers: `tf.losses = tf.keras.losses` & `tf.metrics = tf.keras.metrics` & `tf.initializers = tf.keras.initializers` & `tf.optimizers = tf.keras.optimizers`.
* Updates binary cross entropy logic in Keras when input is probabilities. Instead of converting probabilities to logits, we are using the cross entropy formula for probabilities.
* Added public APIs for `cumsum` and `cumprod` keras backend functions.
* Add support for temporal sample weight mode in subclassed models.
* Raise `ValueError` if an integer is passed to the training APIs.
* Added fault-tolerance support for training Keras model via `model.fit()` with `MultiWorkerMirroredStrategy`, tutorial available.
* Custom Callback tutorial is now available.
* To train with `tf.distribute`, Keras API is recommended over estimator.
* `steps_per_epoch` and `steps` arguments are supported with numpy arrays.
* New error message when unexpected keys are used in sample_weight/class_weight dictionaries
* Losses are scaled in Keras compile/fit and not in the optimizers anymore. If you are using custom training loop, we have new utilities to help scale losses `tf.nn.compute_average_loss`, `tf.nn.scale_regularization_loss`.
* `Layer` apply and add_variable APIs are deprecated.
* Added support for channels first data format in cross entropy losses with logits and support for tensors with unknown ranks.
* Error messages will be raised if `add_update`, `add_metric`, `add_loss`, activity regularizers are used inside of a control flow branch.
* New loss reduction types:
1. `AUTO`: Indicates that the reduction option will be determined by the usage context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When used with `tf.distribute.Strategy`, outside of built-in training loops such as `tf.keras` `compile` and `fit`, we expect reduction value to be `SUM` or `NONE`. Using `AUTO` in that case will raise an error.
2. `NONE`: Weighted losses with one dimension reduced (axis=-1, or axis specified by loss function). When this reduction type used with built-in Keras training loops like `fit`/`evaluate`, the unreduced vector loss is passed to the optimizer but the reported loss will be a scalar value.
3. `SUM`: Scalar sum of weighted losses. 4. `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses. This reduction type is not supported when used with `tf.distribute.Strategy` outside of built-in training loops like `tf.keras` `compile`/`fit`.
* Wraps losses passed to the `compile` API (strings and v1 losses) which are not instances of v2 `Loss` class in `LossWrapper` class. => All losses will now use `SUM_OVER_BATCH_SIZE` reduction as default.
* `model.add_loss(symbolic_tensor)` should work in ambient eager.
* Update metric name to always reflect what the user has given in compile. Affects following cases
1. When name is given as 'accuracy'/'crossentropy'
2. When an aliased function name is used eg. 'mse'
3. Removing the `weighted` prefix from weighted metric names.
* Allow non-Tensors through v2 losses.
* Add v2 sparse categorical crossentropy metric.
* Add v2 APIs for `AUCCurve` and `AUCSummationMethod` enums.
* `add_update` can now be passed a zero-arg callable in order to support turning off the update when setting `trainable=False` on a Layer of a Model compiled with `run_eagerly=True`.
* Standardize the LayerNormalization API by replacing the args `norm_axis` and `params_axis` with `axis`.
* Fixed critical bugs that help with DenseFeatures usability in TF2
* Add support for TensorArrays to `tf.data Dataset`.
* Integrate Ragged Tensors with `tf.data`.
* All core and experimental tf.data transformations that input
user-defined functions can span multiple devices now.
* Extending the TF 2.0 support for `shuffle(...,
reshuffle_each_iteration=True)` and `cache()` to work across different
Python iterators for the same dataset.
* Removing the `experimental_numa_aware` option from `tf.data.Options`.
* Add `num_parallel_reads` and passing in a Dataset containing filenames
into `TextLineDataset` and `FixedLengthRecordDataset`.
* Add support for defaulting the value of `cycle_length` argument of
`tf.data.Dataset.interleave` to the number of schedulable CPU cores.
* Promoting `tf.data.experimental.enumerate_dataset` to core as
`tf.data.Dataset.enumerate`.
* Promoting `tf.data.experimental.unbatch` to core as
`tf.data.Dataset.unbatch`.
* Adds option for introducing slack in the pipeline to reduce CPU
contention, via `tf.data.Options().experimental_slack = True`
* Added experimental support for parallel batching to `batch()` and
`padded_batch()`. This functionality can be enabled through
`tf.data.Options()`.
* Support cancellation of long-running `reduce`.
* Now we use `dataset` node name as prefix instead of the op name, to
identify the component correctly in metrics, for pipelines with repeated
components.
* Improve the performance of datasets using `from_tensors()`.
* Promoting `unbatch` from experimental to core API.
* Adding support for datasets as inputs to `from_tensors` and
`from_tensor_slices` and batching and unbatching of nested datasets.
* `tf.lite`:
* Added evaluation script for `COCO` minival
* Add delegate support for `QUANTIZE`.
* Add `GATHER` support to NN API delegate.
* Added support for TFLiteConverter Python API in 2.0. Contains functions from_saved_model, from_keras_file, and from_concrete_functions.
* Add `EXPAND_DIMS` support to NN API delegate TEST.
* Add `narrow_range` attribute to QuantizeAndDequantizeV2 and V3.
* Added support for `tflite_convert` command line tool in 2.0.
* Post-training quantization tool supports quantizing weights shared by multiple operations. The models made with versions of this tool will use INT8 types for weights and will only be executable interpreters from this version onwards.
* Post-training quantization tool supports fp16 weights and GPU delegate acceleration for fp16.
* Add delegate support for `QUANTIZED_16BIT_LSTM`.
* Extracts `NNAPIDelegateKernel` from nnapi_delegate.cc
* `tf.distribute`:
* TensorRT
* Add TensorFlow 2.0-compatible `TrtGraphConverterV2` API for TensorRT conversion.
TensorRT initialization arguments are now passed wrapped in a named-tuple,
`TrtConversionParams`, rather than as separate arguments as in `TrtGraphConverter`.
* Changed API to optimize TensorRT engines during graph optimization. This is now
done by calling `converter.build()` where previously `is_dynamic_op=False` would
be set.
* `converter.convert()` no longer returns a `tf.function`. Now the function must be
accessed from the saved model.
* The `converter.calibrate()` method has been removed. To trigger calibration, a
`calibration_input_fn` should be provided to `converter.convert()`.
* Enable `tf.distribute.experimental.MultiWorkerMirroredStrategy` working
in eager mode.
* Callbacks are supported in `MultiWorkerMirroredStrategy`.
* Disable `run_eagerly` and distribution strategy if there are symbolic
tensors added to the model using `add_metric` or `add_loss`.
* Loss and gradients should now more reliably be correctly scaled w.r.t.
the global batch size when using a `tf.distribute.Strategy`.
* Set default loss reduction as `AUTO` for improving reliability of loss
scaling with distribution strategy and custom training loops. `AUTO`
indicates that the reduction option will be determined by the usage
context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`.
When used in distribution strategy scope, outside of built-in training
loops such as `tf.keras` `compile` and `fit`, we expect reduction value
to be 'None' or 'SUM'. Using other values will raise an error.
* Support for multi-host `ncclAllReduce` in Distribution Strategy.
* Other:
* Fix accidental quadratic graph construction cost in graph-mode `tf.gradients()`.
* ResourceVariable's gather op supports batch dimensions.
* ResourceVariable support for `gather_nd`.
* `ResourceVariable` and `Variable` no longer accepts `constraint` in the constructor, nor expose it as a @property.
* Added gradient for `SparseToDense` op.
* Expose a flag that allows the number of threads to vary across Python benchmarks.
* `image.resize` in 2.0 now supports gradients for the new resize kernels.
* `image.resize` now considers proper pixel centers and has new kernels (incl. anti-aliasing).
* Renamed `tf.image` functions to remove duplicate "image" where it is redundant.
* Variadic reduce is supported on CPU.
* Remove unused `StringViewVariantWrapper`.
* Delete unused `Fingerprint64Map` op registration
* Add broadcasting support to `tf.matmul`.
* Add C++ Gradient for `BatchMatMulV2`.
* Add `tf.math.cumulative_logsumexp` operation.
* Add ellipsis (...) support for `tf.einsum()`.
* Add expand_composites argument to all `nest.*` methods.
* Added `strings.byte_split`.
* Add a new "result_type" parameter to `tf.strings.split`.
* Add name argument to `tf.string_split` and `tf.strings_split`.
* Extend `tf.strings.split` to support inputs with any rank.
* Added `tf.random.binomial`.
* Added `key` and `skip` methods to `random.experimental.Generator`.
* Extend `tf.function` with basic support for CompositeTensors arguments (such as `SparseTensor` and `RaggedTensor`).
* `parallel_for.pfor`: add converters for Softmax, LogSoftmax, IsNaN, All, Any, and MatrixSetDiag.
* `parallel_for`: add converters for LowerTriangularSolve and Cholesky.
* `parallel_for`: add converters for `LogMatrixDeterminant` and `MatrixBandPart`.
* `parallel_for`: Add converter for `MatrixDiag`.
* `parallel_for`: Add converters for `OneHot`, `LowerBound`, `UpperBound`.
* `parallel_for`: add converter for `BroadcastTo`.
* Add `pfor` converter for `Squeeze`.
* Add `RaggedTensor.placeholder()`.
* Add ragged tensor support to `tf.squeeze`.
* Update RaggedTensors to support int32 row_splits.
* Allow `LinearOperator.solve` to take a `LinearOperator`.
* Allow all dtypes for `LinearOperatorCirculant`.
* Introduce MaxParallelism method
* Add `LinearOperatorHouseholder`.
* Adds Philox support to new stateful RNG's XLA path.
* Added `TensorSpec` support for CompositeTensors.
* Added `tf.linalg.tridiagonal_solve` op.
* Added partial_pivoting input parameter to `tf.linalg.tridiagonal_solve`.
* Added gradient to `tf.linalg.tridiagonal_solve`.
* Added `tf.linalg.tridiagonal_mul op`.
* Added GPU implementation of `tf.linalg.tridiagonal_matmul`.
* Added `LinearOperatorToeplitz`.
* Upgraded LIBXSMM to version 1.11.
* Uniform processing of quantized embeddings by Gather and EmbeddingLookup Ops.
* Correct a misstatement in the documentation of the sparse softmax cross entropy logit parameter.
* Add `tf.ragged.boolean_mask`.
* `tf.switch_case` added, which selects a branch_fn based on a branch_index.
* The C++ kernel of gather op supports batch dimensions.
* Fixed default value and documentation for `trainable` arg of tf.Variable.
* `EagerTensor` now supports numpy buffer interface for tensors.
* This change bumps the version number of the `FullyConnected` Op to 5.
* Added new op: `tf.strings.unsorted_segment_join`.
* Added HW acceleration support for `topK_v2`.
* CloudBigtable version updated to v0.10.0.
* Expose `Head` as public API.
* Added `tf.sparse.from_dense` utility function.
* Improved ragged tensor support in `TensorFlowTestCase`.
* Added a function `nested_value_rowids` for ragged tensors.
* Added `tf.ragged.stack`.
* Makes the a-normal form transformation in Pyct configurable as to which nodes are converted to variables and which are not.
* `ResizeInputTensor` now works for all delegates.
* `tf.cond` emits a StatelessIf op if the branch functions are stateless and do not touch any resources.
* Add support of local soft device placement for eager op.
* Pass partial_pivoting to the `_TridiagonalSolveGrad`.
* Add HW acceleration support for `LogSoftMax`.
* Add guard to avoid acceleration of L2 Normalization with input rank != 4
* Fix memory allocation problem when calling `AddNewInputConstantTensor`.
* Delegate application failure leaves interpreter in valid state
* `tf.while_loop` emits a StatelessWhile op if the cond and body functions are stateless and do not touch any resources.
* `tf.cond`, `tf.while` and if and while in AutoGraph now accept a nonscalar predicate if it has a single element. This does not affect non-V2 control flow.
* Fix potential security vulnerability where decoding variant tensors from proto could result in heap out of bounds memory access.
* Only create a GCS directory object if the object does not already exist.
* Introduce `dynamic` constructor argument in Layer and Model, which should be set to `True` when using imperative control flow in the `call` method.
* Begin adding Go wrapper for C Eager API.
* XLA HLO graphs can be inspected with interactive_graphviz tool now.
* Add dataset ops to the graph (or create kernels in Eager execution) during the python Dataset object creation instead of doing it during Iterator creation time.
* Add `batch_dims` argument to `tf.gather`.
* The behavior of `tf.gather` is now correct when `axis=None` and `batch_dims<0`.
* Update docstring for gather to properly describe the non-empty `batch_dims` case.
* Removing of dtype in the constructor of initializers and partition_info in call.
* Add `tf.math.nextafter` op.
* Turn on MKL-DNN contraction kernels by default. MKL-DNN dynamically dispatches the best kernel implementation based on CPU vector architecture. To disable them, build with `--define=tensorflow_mkldnn_contraction_kernel=0`.
* `tf.linspace(start, stop, num)` now always uses "stop" as last value (for num > 1)
* Added top-k to precision and recall to keras metrics.
* Add a ragged size op and register it to the op dispatcher
* Transitive dependencies on :`pooling_ops` were removed. Some users may need to add explicit dependencies on :`pooling_ops` if they reference the operators from that library.
* Add `CompositeTensor` base class.
* Malformed gif images could result in an access out of bounds in the color palette of the frame. This has been fixed now
* Add templates and interfaces for creating lookup tables
* `Tensor::UnsafeCopyFromInternal` deprecated in favor `Tensor::BitcastFrom`.
* In `map_vectorization` optimization, reduce the degree of parallelism in the vectorized map node.
* Add variant wrapper for `absl::string_view`.
* Add OpKernels for some stateless maps.
* DType is no longer convertible to an int. Use `dtype.as_datatype_enum` instead of `int(dtype)` to get the same result.
* Support both binary and -1/1 label input in v2 hinge and squared hinge losses.
* Added `LinearOperator.adjoint` and `LinearOperator.H` (alias).
* Expose CriticalSection in core as `tf.CriticalSection`.
* Enhanced graphviz output.
* Add opkernel templates for common table operations.
* Fix callbacks not logging values in eager mode when a deferred-build model is used.
* `SignatureDef` util functions have been deprecated.
* Update `Fingerprint64Map` to use aliases
* Add legacy string flat hash map op kernels.
* Add support for `add_metric` in the graph function mode.
* Updating cosine similarity loss - removed the negate sign from cosine similarity.
* Changed default for gradient accumulation for TPU embeddings to true.
* Adds summary trace API for collecting graph and profile information.
* The `precision_mode` argument to `TrtGraphConverter` is now case insensitive.
* `tf.estimator`:
* Replace `tf.contrib.estimator.add_metrics` with
`tf.estimator.add_metrics`
* Use `tf.compat.v1.estimator.inputs` instead of `tf.estimator.inputs`
* Replace contrib references with `tf.estimator.experimental.*` for apis
in early_s in Estimator
* Canned Estimators will now use keras optimizers by default. An error
will be raised if tf.train.Optimizers are used, and you will have to
switch to tf.keras.optimizers or tf.compat.v1 canned Estimators.
* A checkpoint converter for canned Estimators has been provided to
transition canned Estimators that are warm started from
`tf.train.Optimizers` to `tf.keras.optimizers`.
* Losses are scaled in canned estimator v2 and not in the optimizers
anymore. If you are using Estimator + distribution strategy + optimizer
v1 then the behavior does not change. This implies that if you are using
custom estimator with optimizer v2, you have to scale losses. We have
new utilities to help scale losses `tf.nn.compute_average_loss`,
`tf.nn.scale_regularization_loss`.
* `tf.keras`:
* Premade models (including Linear and WideDeep) have been introduced for
the purpose of replacing Premade estimators.
* Model saving changes
* `model.save` and `tf.saved_model.save` may now save to the TensorFlow
SavedModel format. The model can be restored using
`tf.keras.models.load_model`. HDF5 files are still supported, and may be
used by specifying `save_format="h5"` when saving.
* Raw TensorFlow functions can now be used in conjunction with the Keras
Functional API during model creation. This obviates the need for users
to create Lambda layers in most cases when using the Functional API.
Like Lambda layers, TensorFlow functions that result in Variable
creation or assign ops are not supported.
* Add support for passing list of lists to the `metrics` argument in Keras
`compile`.
* Add `tf.keras.layers.AbstractRNNCell` as the preferred implementation
for RNN cells in TF v2. User can use it to implement RNN cells with
custom behavior.
* Keras training and validation curves are shown on the same plot when
using the TensorBoard callback.
* Switched Keras `fit/evaluate/predict` execution to use only a single
unified path by default unless eager execution has been explicitly
disabled, regardless of input type. This unified path places an
eager-friendly training step inside of a `tf.function`. With this change:
* All input types are converted to `Dataset`.
* The path assumes there is always a distribution strategy; when no
  distribution strategy is specified, the path uses a no-op
  distribution strategy.
* The training step is wrapped in `tf.function` unless `run_eagerly=True`
is set in compile. The single path execution code does not yet support
all use cases. We fall back to the existing v1 execution paths if your
model contains the following:
1. `sample_weight_mode` in compile
2. `weighted_metrics` in compile
3. v1 optimizer
4. target tensors in compile. If you are experiencing any issues because
   of this change, please inform us (file an issue) about your use case;
   in the meantime you can unblock yourself by setting
   `experimental_run_tf_function=False` in compile (see the compile sketch
   after this list). We have seen a couple of use cases where the model
   usage pattern is not as expected and would not work with this change:
* output tensors of one layer are used in the constructor of another.
* symbolic tensors outside the scope of the model are used in custom loss
  functions. The flag can be disabled for these cases; ideally the
  usage pattern should be fixed.
* Mark Keras `set_session` as `compat.v1` only.
* `tf.keras.estimator.model_to_estimator` now supports exporting to
the `tf.train.Checkpoint` format, which allows the saved checkpoints to be
compatible with `model.load_weights`.
* `keras.backend.resize_images` (and consequently,
  `keras.layers.UpSampling2D`) behavior has changed; a bug in the resizing
  implementation was fixed.
* Add an `implementation=3` mode for `tf.keras.layers.LocallyConnected2D`
and `tf.keras.layers.LocallyConnected1D` layers using `tf.SparseTensor`
to store weights, allowing a dramatic speedup for large sparse models.
* Raise error if `batch_size` argument is used when input is
dataset/generator/keras sequence.
* Update TF 2.0 `keras.backend.name_scope` to use TF 2.0 `name_scope`.
* Add v2 module aliases for losses, metrics, initializers and optimizers:
`tf.losses = tf.keras.losses` & `tf.metrics = tf.keras.metrics` &
`tf.initializers = tf.keras.initializers` & `tf.optimizers =
tf.keras.optimizers`.
* Updates binary cross entropy logic in Keras when input is probabilities.
Instead of converting probabilities to logits, we are using the cross
entropy formula for probabilities.
* Added public APIs for `cumsum` and `cumprod` keras backend functions.
* Add support for temporal sample weight mode in subclassed models.
* Raise `ValueError` if an integer is passed to the training APIs.
* Added fault-tolerance support for training a Keras model via `model.fit()`
  with `MultiWorkerMirroredStrategy`; a tutorial is available.
* Custom Callback tutorial is now available.
* To train with `tf.distribute`, Keras API is recommended over estimator.
* `steps_per_epoch` and `steps` arguments are supported with numpy arrays.
* New error message when unexpected keys are used in
sample_weight/class_weight dictionaries
* Losses are scaled in Keras compile/fit and not in the optimizers
  anymore. If you are using a custom training loop, we have new utilities
  to help scale losses: `tf.nn.compute_average_loss` and
  `tf.nn.scale_regularization_loss` (see the loss-scaling sketch after
  this list).
* `Layer` apply and add_variable APIs are deprecated.
* Added support for channels first data format in cross entropy losses
with logits and support for tensors with unknown ranks.
* Error messages will be raised if `add_update`, `add_metric`, `add_loss`,
  or activity regularizers are used inside of a control flow branch.
* New loss reduction types (see the reduction sketch after this list):
  * `AUTO`: Indicates that the reduction option will be determined by the
    usage context. For almost all cases this defaults to
    `SUM_OVER_BATCH_SIZE`. When used with `tf.distribute.Strategy`, outside
    of built-in training loops such as `tf.keras` `compile` and `fit`, we
    expect the reduction value to be `SUM` or `NONE`. Using `AUTO` in that
    case will raise an error.
  * `NONE`: Weighted losses with one dimension reduced (axis=-1, or the
    axis specified by the loss function). When this reduction type is used
    with built-in Keras training loops like `fit`/`evaluate`, the unreduced
    vector loss is passed to the optimizer but the reported loss will be a
    scalar value.
  * `SUM`: Scalar sum of weighted losses.
  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by the number of elements
    in losses. This reduction type is not supported when used with
    `tf.distribute.Strategy` outside of built-in training loops like
    `tf.keras` `compile`/`fit`.
* Wraps losses passed to the `compile` API (strings and v1 losses) that
  are not instances of the v2 `Loss` class in a `LossWrapper` class. As a
  result, all losses will now use `SUM_OVER_BATCH_SIZE` reduction by
  default.
* `model.add_loss(symbolic_tensor)` should work in ambient eager.
* Update metric name to always reflect what the user has given in compile.
  This affects the following cases:
  * When the name is given as 'accuracy'/'crossentropy'
  * When an aliased function name is used, e.g. 'mse'
* Removing the `weighted` prefix from weighted metric names.
* Allow non-Tensors through v2 losses.
* Add v2 sparse categorical crossentropy metric.
* Add v2 APIs for `AUCCurve` and `AUCSummationMethod` enums.
* `add_update` can now be passed a zero-arg callable in order to support
turning off the update when setting `trainable=False` on a Layer of a
Model compiled with `run_eagerly=True`.
* Standardize the LayerNormalization API by replacing the args `norm_axis`
and `params_axis` with `axis`.
* Fixed critical bugs to improve `DenseFeatures` usability in TF2.
* `tf.lite`:
* Added evaluation script for `COCO` minival
* Add delegate support for `QUANTIZE`.
* Add `GATHER` support to NN API delegate.
* Added support for TFLiteConverter Python API in 2.0. Contains functions
from_saved_model, from_keras_file, and from_concrete_functions.
* Add `EXPAND_DIMS` support to NN API delegate TEST.
* Add `narrow_range` attribute to QuantizeAndDequantizeV2 and V3.
* Added support for `tflite_convert` command line tool in 2.0.
* Post-training quantization tool supports quantizing weights shared by
  multiple operations. The models made with versions of this tool will use
  INT8 types for weights and will only be executable with interpreters from
  this version onwards.
* Post-training quantization tool supports fp16 weights and GPU delegate
acceleration for fp16.
* Add delegate support for `QUANTIZED_16BIT_LSTM`.
* Extracts `NNAPIDelegateKernel` from nnapi_delegate.cc
* TensorRT
* Add TensorFlow 2.0-compatible `TrtGraphConverterV2` API for TensorRT
conversion. TensorRT initialization arguments are now passed wrapped in
a named-tuple, `TrtConversionParams`, rather than as separate arguments
as in `TrtGraphConverter`.
* Changed API to optimize TensorRT engines during graph optimization.
  This is now done by calling `converter.build()` where previously
  `is_dynamic_op=False` would be set (see the TensorRT sketch after this
  list).
* `converter.convert()` no longer returns a `tf.function`. Now the
function must be accessed from the saved model.
* The `converter.calibrate()` method has been removed. To trigger
calibration, a `calibration_input_fn` should be provided to
`converter.convert()`.
* Other:
* Fix accidental quadratic graph construction cost in graph-mode
`tf.gradients()`.
* ResourceVariable's gather op supports batch dimensions.
* ResourceVariable support for `gather_nd`.
* `ResourceVariable` and `Variable` no longer accept `constraint` in the
  constructor, nor expose it as a `@property`.
* Added gradient for `SparseToDense` op.
* Expose a flag that allows the number of threads to vary across Python
benchmarks.
* `image.resize` in 2.0 now supports gradients for the new resize kernels.
* `image.resize` now considers proper pixel centers and has new kernels
(incl. anti-aliasing).
* Renamed `tf.image` functions to remove duplicate "image" where it is
redundant.
* Variadic reduce is supported on CPU.
* Remove unused `StringViewVariantWrapper`.
* Delete unused `Fingerprint64Map` op registration
* Add broadcasting support to `tf.matmul`.
* Add C++ Gradient for `BatchMatMulV2`.
* Add `tf.math.cumulative_logsumexp` operation.
* Add ellipsis (...) support for `tf.einsum()`.
* Add expand_composites argument to all `nest.*` methods.
* Added `strings.byte_split`.
* Add a new "result_type" parameter to `tf.strings.split`.
* Add name argument to `tf.string_split` and `tf.strings_split`.
* Extend `tf.strings.split` to support inputs with any rank.
* Added `tf.random.binomial`.
* Added `key` and `skip` methods to `random.experimental.Generator`.
* Extend `tf.function` with basic support for CompositeTensors arguments
(such as `SparseTensor` and `RaggedTensor`).
* `parallel_for.pfor`: add converters for Softmax, LogSoftmax, IsNaN, All,
Any, and MatrixSetDiag.
* `parallel_for`: add converters for LowerTriangularSolve and Cholesky.
* `parallel_for`: add converters for `LogMatrixDeterminant` and
`MatrixBandPart`.
* `parallel_for`: Add converter for `MatrixDiag`.
* `parallel_for`: Add converters for `OneHot`, `LowerBound`, `UpperBound`.
* `parallel_for`: add converter for `BroadcastTo`.
* Add `pfor` converter for `Squeeze`.
* Add `RaggedTensor.placeholder()`.
* Add ragged tensor support to `tf.squeeze`.
* Update RaggedTensors to support int32 row_splits.
* Allow `LinearOperator.solve` to take a `LinearOperator`.
* Allow all dtypes for `LinearOperatorCirculant`.
* Introduce MaxParallelism method
* Add `LinearOperatorHouseholder`.
* Adds Philox support to new stateful RNG's XLA path.
* Added `TensorSpec` support for CompositeTensors.
* Added `tf.linalg.tridiagonal_solve` op.
* Added partial_pivoting input parameter to `tf.linalg.tridiagonal_solve`.
* Added gradient to `tf.linalg.tridiagonal_solve`.
* Added `tf.linalg.tridiagonal_mul` op.
* Added GPU implementation of `tf.linalg.tridiagonal_matmul`.
* Added `LinearOperatorToeplitz`.
* Upgraded LIBXSMM to version 1.11.
* Uniform processing of quantized embeddings by Gather and EmbeddingLookup
Ops.
* Correct a misstatement in the documentation of the sparse softmax cross
entropy logit parameter.
* Add `tf.ragged.boolean_mask`.
* `tf.switch_case` added, which selects a branch_fn based on a
branch_index.
* The C++ kernel of gather op supports batch dimensions.
* Fixed default value and documentation for `trainable` arg of
tf.Variable.
* `EagerTensor` now supports numpy buffer interface for tensors.
* This change bumps the version number of the `FullyConnected` Op to 5.
* Added new op: `tf.strings.unsorted_segment_join`.
* Added HW acceleration support for `topK_v2`.
* CloudBigtable version updated to v0.10.0.
* Expose `Head` as public API.
* Added `tf.sparse.from_dense` utility function.
* Improved ragged tensor support in `TensorFlowTestCase`.
* Added a function `nested_value_rowids` for ragged tensors.
* Added `tf.ragged.stack`.
* Makes the a-normal form transformation in Pyct configurable as to which
nodes are converted to variables and which are not.
* `ResizeInputTensor` now works for all delegates.
* `tf.cond` emits a StatelessIf op if the branch functions are stateless
and do not touch any resources.
* Add support of local soft device placement for eager op.
* Pass partial_pivoting to the `_TridiagonalSolveGrad`.
* Add HW acceleration support for `LogSoftMax`.
* Add guard to avoid acceleration of L2 Normalization with input rank != 4
* Fix memory allocation problem when calling `AddNewInputConstantTensor`.
* Delegate application failure leaves interpreter in valid state
* `tf.while_loop` emits a StatelessWhile op if the cond and body functions
are stateless and do not touch any resources.
* `tf.cond`, `tf.while`, and `if` and `while` in AutoGraph now accept a
  nonscalar predicate if it has a single element. This does not affect
  non-V2 control flow.
* Fix potential security vulnerability where decoding variant tensors from
proto could result in heap out of bounds memory access.
* Only create a GCS directory object if the object does not already exist.
* Introduce `dynamic` constructor argument in Layer and Model, which
should be set to `True` when using imperative control flow in the `call`
method.
* Begin adding Go wrapper for C Eager API.
* XLA HLO graphs can be inspected with interactive_graphviz tool now.
* Add dataset ops to the graph (or create kernels in Eager execution)
  during the python Dataset object creation instead of doing it at
  Iterator creation time.
* Add `batch_dims` argument to `tf.gather`.
* The behavior of `tf.gather` is now correct when `axis=None` and
`batch_dims<0`.
* Update docstring for gather to properly describe the non-empty
`batch_dims` case.
* Removal of `dtype` in the constructor of initializers and of
  `partition_info` in call.
* Add `tf.math.nextafter` op.
* Turn on MKL-DNN contraction kernels by default. MKL-DNN dynamically
dispatches the best kernel implementation based on CPU vector
architecture. To disable them, build with
`--define=tensorflow_mkldnn_contraction_kernel=0`.
* `tf.linspace(start, stop, num)` now always uses "stop" as last value
(for num > 1)
* Added top-k support to precision and recall in Keras metrics.
* Add a ragged size op and register it to the op dispatcher
* Transitive dependencies on :`pooling_ops` were removed. Some users may
need to add explicit dependencies on :`pooling_ops` if they reference
the operators from that library.
* Add `CompositeTensor` base class.
* Malformed gif images could result in an out-of-bounds access in the
  color palette of the frame. This has now been fixed.
* Add templates and interfaces for creating lookup tables
* `Tensor::UnsafeCopyFromInternal` deprecated in favor of
  `Tensor::BitcastFrom`.
* In `map_vectorization` optimization, reduce the degree of parallelism in
the vectorized map node.
* Add variant wrapper for `absl::string_view`.
* Add OpKernels for some stateless maps.
* DType is no longer convertible to an int. Use `dtype.as_datatype_enum`
instead of `int(dtype)` to get the same result.
* Support both binary and -1/1 label input in v2 hinge and squared hinge
losses.
* Added `LinearOperator.adjoint` and `LinearOperator.H` (alias).
* Expose CriticalSection in core as `tf.CriticalSection`.
* Enhanced graphviz output.
* Add opkernel templates for common table operations.
* Fix callbacks not logging values in eager mode when a deferred-build
  model is used.
* `SignatureDef` util functions have been deprecated.
* Update `Fingerprint64Map` to use aliases
* Add legacy string flat hash map op kernels.
* Add support for `add_metric` in the graph function mode.
* Updated the cosine similarity loss: removed the negation sign from
  cosine similarity.
* Changed default for gradient accumulation for TPU embeddings to true.
* Adds summary trace API for collecting graph and profile information.
* The `precision_mode` argument to `TrtGraphConverter` is now case
insensitive.
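
The saving sketch referenced above is a minimal illustration of the Keras
model-saving changes (SavedModel by default, HDF5 via `save_format="h5"`);
the tiny model and the `/tmp` paths are illustrative assumptions, not part
of the release.

```python
import tensorflow as tf

# A throwaway model, used only to show the two save formats.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="adam", loss="mse")

# Saving to a directory produces the TensorFlow SavedModel format.
model.save("/tmp/example_model")
restored = tf.keras.models.load_model("/tmp/example_model")

# HDF5 files are still supported by passing save_format="h5".
model.save("/tmp/example_model.h5", save_format="h5")
```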
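The compile sketch referenced above shows the `experimental_run_tf_function=False`
escape hatch for the unified `fit`/`evaluate`/`predict` path; the toy model is
an assumption used only to make the snippet runnable.

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

# Opt out of the single unified (tf.function-wrapped) execution path and
# fall back to the v1 execution paths, e.g. while migrating a model that
# still relies on sample_weight_mode, weighted_metrics, or a v1 optimizer.
model.compile(optimizer="adam", loss="mse",
              experimental_run_tf_function=False)
```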
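The loss-scaling sketch referenced above demonstrates the new
`tf.nn.compute_average_loss` and `tf.nn.scale_regularization_loss` utilities
in a custom training loop; the toy model, the SGD optimizer, and the global
batch size are assumptions, not part of the release.

```python
import tensorflow as tf

GLOBAL_BATCH_SIZE = 32  # assumed value for illustration
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
optimizer = tf.keras.optimizers.SGD()
loss_fn = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE)  # keep per-example losses

@tf.function
def train_step(features, labels):
    with tf.GradientTape() as tape:
        predictions = model(features, training=True)
        # Average over the global batch size rather than the per-replica size.
        loss = tf.nn.compute_average_loss(
            loss_fn(labels, predictions), global_batch_size=GLOBAL_BATCH_SIZE)
        if model.losses:  # add any regularization losses, scaled consistently
            loss += tf.nn.scale_regularization_loss(tf.add_n(model.losses))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Example call with random data of the assumed shapes.
train_step(tf.random.normal([GLOBAL_BATCH_SIZE, 4]),
           tf.random.normal([GLOBAL_BATCH_SIZE, 1]))
```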
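The reduction sketch referenced above makes the documented reduction
semantics concrete with made-up tensors; it illustrates the behavior rather
than specifying it exhaustively.

```python
import tensorflow as tf

y_true = tf.constant([[0.0, 1.0], [1.0, 1.0]])
y_pred = tf.constant([[0.6, 0.4], [0.4, 0.6]])

for reduction in (tf.keras.losses.Reduction.NONE,
                  tf.keras.losses.Reduction.SUM,
                  tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE):
    mse = tf.keras.losses.MeanSquaredError(reduction=reduction)
    # NONE keeps one loss value per example (the last axis is reduced);
    # SUM returns their scalar sum; SUM_OVER_BATCH_SIZE divides that sum
    # by the number of loss elements.
    print(reduction, mse(y_true, y_pred).numpy())
```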
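The TensorRT sketch referenced above is a hedged outline of the 2.0
conversion flow using `TrtGraphConverterV2` with a `TrtConversionParams`
named tuple; the SavedModel paths and the input shape used by `input_fn`
are assumptions, and a TensorRT-enabled build is required for it to run.

```python
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode="FP16",  # one of "FP32", "FP16", "INT8"
    max_workspace_size_bytes=1 << 30)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir="/tmp/example_saved_model",  # assumed path
    conversion_params=params)
converter.convert()  # no longer returns a tf.function

def input_fn():
    # Representative inputs used to build the TensorRT engines
    # (replaces the old is_dynamic_op=False behavior); shape is assumed.
    yield (tf.random.normal([1, 224, 224, 3]),)

converter.build(input_fn=input_fn)
converter.save("/tmp/example_saved_model_trt")
```
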
## Thanks to our Contributors
@ -715,7 +966,7 @@ Weweler, Zantares, zjjott, 卜居, 王振华 (Wang Zhenhua), 黄鑫
* Updates `png_archive` dependency to 1.6.37 to not be affected by
CVE-2019-7317, CVE-2018-13785, and CVE-2018-14048.
* Updates `sqlite` depenency to 3.28.0 to not be affected by CVE-2018-20506,
* Updates `sqlite` dependency to 3.28.0 to not be affected by CVE-2018-20506,
CVE-2018-20346, and CVE-2018-20505.
# Release 1.12.2
@ -901,9 +1152,9 @@ Weweler, Zantares, zjjott, 卜居, 王振华 (Wang Zhenhua), 黄鑫
compilation as a second return argument.
* XLA HLO graphs can now be rendered as SVG/HTML.
* Estimator
* Replace all occurences of `tf.contrib.estimator.BaselineEstimator` with
* Replace all occurrences of `tf.contrib.estimator.BaselineEstimator` with
`tf.estimator.BaselineEstimator`
* Replace all occurences of
* Replace all occurrences of
`tf.contrib.estimator.DNNLinearCombinedEstimator` with
`tf.estimator.DNNLinearCombinedEstimator`
* Replace all occurrences of `tf.contrib.estimator.DNNEstimator` with
@ -915,7 +1166,7 @@ Weweler, Zantares, zjjott, 卜居, 王振华 (Wang Zhenhua), 黄鑫
`tf.estimator.Estimator.experimental_export_all_saved_models`.
* Update `regression_head` to the new Head API for Canned Estimator V2.
* Switch `multi_class_head` to Head API for Canned Estimator V2.
* Replace all occurences of `tf.contrib.estimator.InMemoryEvaluatorHook`
* Replace all occurrences of `tf.contrib.estimator.InMemoryEvaluatorHook`
and `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with
`tf.estimator.experimental.InMemoryEvaluatorHook` and
`tf.estimator.experimental.make_stop_at_checkpoint_step_hook`

View File

@ -89,7 +89,7 @@ swift_rules_dependencies()
# files, in case the parsing of those build files depends on the bazel
# version we require here.
load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
check_bazel_version_at_least("0.19.0")
check_bazel_version_at_least("1.0.0")
load("//third_party/android:android_configure.bzl", "android_configure")
android_configure(name="local_config_android")

View File

@ -33,7 +33,7 @@ except ImportError:
from distutils.spawn import find_executable as which
# pylint: enable=g-import-not-at-top
_DEFAULT_CUDA_VERSION = '10.1'
_DEFAULT_CUDA_VERSION = '10'
_DEFAULT_CUDNN_VERSION = '7'
_DEFAULT_TENSORRT_VERSION = '6'
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
@ -49,8 +49,8 @@ _TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
_TF_WORKSPACE_ROOT = ''
_TF_BAZELRC = ''
_TF_CURRENT_BAZEL_VERSION = None
_TF_MIN_BAZEL_VERSION = '0.27.1'
_TF_MAX_BAZEL_VERSION = '1.1.0'
_TF_MIN_BAZEL_VERSION = '1.2.1'
_TF_MAX_BAZEL_VERSION = '1.2.1'
NCCL_LIB_PATHS = [
'lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''
@ -147,14 +147,16 @@ def write_action_env_to_bazelrc(var_name, var):
write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var)))
def run_shell(cmd, allow_non_zero=False):
def run_shell(cmd, allow_non_zero=False, stderr=None):
if stderr is None:
stderr = sys.stdout
if allow_non_zero:
try:
output = subprocess.check_output(cmd)
output = subprocess.check_output(cmd, stderr=stderr)
except subprocess.CalledProcessError as e:
output = e.output
else:
output = subprocess.check_output(cmd)
output = subprocess.check_output(cmd, stderr=stderr)
return output.decode('UTF-8').strip()
@ -169,10 +171,12 @@ def get_python_path(environ_cp, python_bin_path):
if environ_cp.get('PYTHONPATH'):
python_paths = environ_cp.get('PYTHONPATH').split(':')
try:
stderr = open(os.devnull, 'wb')
library_paths = run_shell([
python_bin_path, '-c',
'import site; print("\\n".join(site.getsitepackages()))'
]).split('\n')
],
stderr=stderr).split('\n')
except subprocess.CalledProcessError:
library_paths = [
run_shell([
@ -1179,10 +1183,17 @@ def system_specific_test_config(env):
write_to_bazelrc('test --test_env=LD_LIBRARY_PATH')
else:
test_and_build_filters.append('-gpu')
write_to_bazelrc('test --test_tag_filters=%s' %
# Disable tests with "v1only" tag in "v2" Bazel config, but not in "v1" config
write_to_bazelrc('test:v1 --test_tag_filters=%s' %
','.join(test_and_build_filters + test_only_filters))
write_to_bazelrc('test --build_tag_filters=%s' %
write_to_bazelrc('test:v1 --build_tag_filters=%s' %
','.join(test_and_build_filters))
write_to_bazelrc(
'test:v2 --test_tag_filters=%s' %
','.join(test_and_build_filters + test_only_filters + ['-v1only']))
write_to_bazelrc('test:v2 --build_tag_filters=%s' %
','.join(test_and_build_filters + ['-v1only']))
def set_system_libs_flag(environ_cp):
@ -1232,20 +1243,6 @@ def is_reduced_optimize_huge_functions_available(environ_cp):
def set_windows_build_flags(environ_cp):
"""Set Windows specific build options."""
# The non-monolithic build is not supported yet
write_to_bazelrc('build --config monolithic')
# Suppress warning messages
write_to_bazelrc('build --copt=-w --host_copt=-w')
# Fix winsock2.h conflicts
write_to_bazelrc(
'build --copt=-DWIN32_LEAN_AND_MEAN --host_copt=-DWIN32_LEAN_AND_MEAN '
'--copt=-DNOGDI --host_copt=-DNOGDI')
# Output more verbose information when something goes wrong
write_to_bazelrc('build --verbose_failures')
# The host and target platforms are the same in Windows build. So we don't
# have to distinct them. This avoids building the same targets twice.
write_to_bazelrc('build --distinct_host_configuration=false')
if is_reduced_optimize_huge_functions_available(environ_cp):
write_to_bazelrc(
'build --copt=/d2ReducedOptimizeHugeFunctions --host_copt=/d2ReducedOptimizeHugeFunctions'

View File

@ -2,12 +2,9 @@
# TensorFlow is a computational framework, primarily for use in machine
# learning applications.
load("//tensorflow:tensorflow.bzl", "VERSION")
load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
load("//tensorflow:tensorflow.bzl", "VERSION", "tf_cc_shared_object", "tf_custom_op_library_additional_deps_impl", "tf_native_cc_binary")
load(
"//tensorflow/core/platform:default/build_config.bzl",
"//tensorflow/core/platform:build_config.bzl",
"tf_additional_binary_deps",
)
load(
@ -198,6 +195,12 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "chromiumos",
values = {"crosstool_top": "//external:android/chromiumos"},
visibility = ["//visibility:public"],
)
config_setting(
name = "linux_aarch64",
values = {"cpu": "aarch64"},
@ -450,11 +453,13 @@ config_setting(
package_group(
name = "internal",
packages = [
"//learning/brain/swift/x10/...",
"//perftools/accelerators/xprof/api/...",
"//tensorflow/...",
"//tensorflow_estimator/python/estimator/...",
"//tensorflow_models/official/...",
"//third_party/py/autograph/...",
"//third_party/swift/tensorflow/x10/...",
],
)
@ -473,7 +478,7 @@ bzl_library(
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core/platform:build_config_root_bzl",
"//tensorflow/core/platform:cuda_build_defs_bzl",
"//tensorflow/core/platform/default:cuda_build_defs_bzl",
"//third_party/mkl:build_defs_bzl",
"//third_party/mkl_dnn:build_defs_bzl",
"//third_party/ngraph:build_defs_bzl",
@ -855,7 +860,7 @@ gen_api_init_files(
output_files = TENSORFLOW_API_INIT_FILES_V1,
output_package = "tensorflow._api.v1",
root_file_name = "v1.py",
root_init_template = "api_template_v1.__init__.py",
root_init_template = "$(location api_template_v1.__init__.py)",
)
gen_api_init_files(
@ -878,7 +883,7 @@ gen_api_init_files(
output_files = TENSORFLOW_API_INIT_FILES_V2,
output_package = "tensorflow._api.v2",
root_file_name = "v2.py",
root_init_template = "api_template.__init__.py",
root_init_template = "$(location api_template.__init__.py)",
)
py_library(

View File

@ -89,6 +89,7 @@ except ImportError:
# Enable TF2 behaviors
from tensorflow.python.compat import v2_compat as _compat # pylint: disable=g-import-not-at-top
_compat.enable_v2_behavior()
_major_api_version = 2
# Load all plugin libraries from site-packages/tensorflow-plugins if we are
@ -119,11 +120,17 @@ def _running_from_pip_package():
_current_file_location.startswith(dir_) for dir_ in _site_packages_dirs)
if _running_from_pip_package():
for s in _site_packages_dirs:
# TODO(gunan): Add sanity checks to loaded modules here.
plugin_dir = _os.path.join(s, 'tensorflow-plugins')
if _fi.file_exists(plugin_dir):
_ll.load_library(plugin_dir)
# TODO(gunan): Add sanity checks to loaded modules here.
for _s in _site_packages_dirs:
# Load first party dynamic kernels.
_main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels')
if _fi.file_exists(_main_dir):
_ll.load_library(_main_dir)
# Load third party dynamic kernels.
_plugin_dir = _os.path.join(_s, 'tensorflow-plugins')
if _fi.file_exists(_plugin_dir):
_ll.load_library(_plugin_dir)
# Add module aliases
if hasattr(_current_module, 'keras'):
@ -136,3 +143,5 @@ if hasattr(_current_module, 'keras'):
setattr(_current_module, "optimizers", optimizers)
setattr(_current_module, "initializers", initializers)
# pylint: enable=undefined-variable
# __all__ PLACEHOLDER

View File

@ -104,6 +104,8 @@ from tensorflow.python.platform import flags # pylint: disable=g-import-not-at-
_current_module.app.flags = flags # pylint: disable=undefined-variable
setattr(_current_module, "flags", flags)
_major_api_version = 1
# Load all plugin libraries from site-packages/tensorflow-plugins if we are
# running under pip.
# TODO(gunan): Enable setting an environment variable to define arbitrary plugin
@ -132,9 +134,16 @@ def _running_from_pip_package():
_current_file_location.startswith(dir_) for dir_ in _site_packages_dirs)
if _running_from_pip_package():
for s in _site_packages_dirs:
# TODO(gunan): Add sanity checks to loaded modules here.
plugin_dir = _os.path.join(s, 'tensorflow-plugins')
if _fi.file_exists(plugin_dir):
_ll.load_library(plugin_dir)
# TODO(gunan): Add sanity checks to loaded modules here.
for _s in _site_packages_dirs:
# Load first party dynamic kernels.
_main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels')
if _fi.file_exists(_main_dir):
_ll.load_library(_main_dir)
# Load third party dynamic kernels.
_plugin_dir = _os.path.join(_s, 'tensorflow-plugins')
if _fi.file_exists(_plugin_dir):
_ll.load_library(_plugin_dir)
# __all__ PLACEHOLDER

View File

@ -53,6 +53,20 @@ filegroup(
visibility = ["//visibility:public"],
)
filegroup(
name = "pywrap_eager_hdrs",
srcs = [
"c_api_internal.h",
"tf_status_helper.h",
"tf_status_internal.h",
"tf_tensor_internal.h",
],
visibility = [
"//tensorflow/core:__pkg__",
"//tensorflow/python:__pkg__",
],
)
tf_cuda_library(
name = "c_api_internal",
hdrs = [
@ -108,6 +122,7 @@ tf_cuda_library(
":tf_attrtype",
":tf_status_internal",
":tf_file_statistics",
":tf_tensor_internal",
] + select({
"//tensorflow:with_xla_support": [
"//tensorflow/compiler/tf2xla:xla_compiler",
@ -196,6 +211,12 @@ cc_library(
}),
)
cc_library(
name = "tf_status_headers",
hdrs = ["tf_status.h"],
visibility = ["//visibility:public"],
)
cc_library(
name = "tf_file_statistics",
hdrs = ["tf_file_statistics.h"],
@ -245,6 +266,7 @@ tf_cuda_library(
"tf_tensor.h",
"tf_tensor_internal.h",
],
visibility = ["//tensorflow/c:__subpackages__"],
deps = select({
"//tensorflow:android": [
"//tensorflow/core:android_tensorflow_lib_lite",
@ -253,6 +275,7 @@ tf_cuda_library(
":tf_datatype",
":tf_status",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
],
}),
)
@ -279,6 +302,7 @@ tf_cuda_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/common_runtime/eager:attr_builder",
"//tensorflow/core/common_runtime/eager:context",
"//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
"//tensorflow/core/platform",
"@com_google_absl//absl/strings",

View File

@ -458,7 +458,7 @@ static void TF_Run_Helper(
EmptyTensor(static_cast<TF_DataType>(src.dtype()), src.shape());
continue;
}
c_outputs[i] = TF_TensorFromTensor(src, status);
c_outputs[i] = TF_TensorFromTensor(src, &status->status);
if (!status->status.ok()) return;
}
}
@ -1493,7 +1493,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
Tensor t;
status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
if (!status->status.ok()) return;
*value = TF_TensorFromTensor(t, status);
*value = TF_TensorFromTensor(t, &status->status);
}
void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
@ -1504,7 +1504,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
if (!status->status.ok()) return;
const auto len = std::min(max_values, static_cast<int>(ts.size()));
for (int i = 0; i < len; ++i) {
values[i] = TF_TensorFromTensor(ts[i], status);
values[i] = TF_TensorFromTensor(ts[i], &status->status);
}
}
@ -2398,7 +2398,7 @@ unsigned char TF_TryEvaluateConstant(TF_Graph* graph, TF_Output output,
graph->graph.versions().producer(), &evaluated, &result_tensor);
if (evaluated) {
DCHECK(status->status.ok());
*result = TF_TensorFromTensor(result_tensor, status);
*result = TF_TensorFromTensor(result_tensor, &status->status);
if (!status->status.ok()) evaluated = false;
}
return evaluated;

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/c/eager/c_api_internal.h"
#include "tensorflow/compiler/jit/flags.h"
#include "tensorflow/core/common_runtime/eager/attr_builder.h"
#include "tensorflow/core/common_runtime/eager/context.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/shape_inference.h"
@ -634,7 +635,7 @@ TF_Tensor* TF_CheckpointReaderGetTensor(TF_CheckpointReader* reader,
std::unique_ptr<tensorflow::Tensor> tensor;
reader->GetTensor(name, &tensor, status);
if (!status->status.ok()) return nullptr;
return tensorflow::TF_TensorFromTensor(*tensor, status);
return tensorflow::TF_TensorFromTensor(*tensor, &status->status);
}
void TF_CheckpointReaderGetVariableShape(TF_CheckpointReader* reader,
@ -767,8 +768,9 @@ tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def,
} while (0);
// New server created for new server_def. Unused if updating server_def.
tensorflow::EagerContext* context = ctx->context;
tensorflow::GrpcServer* grpc_server =
dynamic_cast<tensorflow::GrpcServer*>(ctx->context->GetServer());
dynamic_cast<tensorflow::GrpcServer*>(context->GetServer());
if (grpc_server == nullptr) {
std::unique_ptr<tensorflow::ServerInterface> new_server;
LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &new_server));
@ -779,12 +781,12 @@ tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def,
}
LOG_AND_RETURN_IF_ERROR(grpc_server->Start());
LOG_AND_RETURN_IF_ERROR(ctx->context->StoreCollectiveOpsServer(
LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer(
std::move(new_server), grpc_server->worker_env()->device_mgr,
grpc_server->worker_env()->collective_executor_mgr));
} else {
LOG_AND_RETURN_IF_ERROR(grpc_server->UpdateServerDef(server_def));
LOG_AND_RETURN_IF_ERROR(ctx->context->StoreCollectiveOpsServer(
LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer(
/*new_server=*/nullptr, grpc_server->worker_env()->device_mgr,
grpc_server->worker_env()->collective_executor_mgr));
}

View File

@ -1260,11 +1260,10 @@ TEST_F(CApiFunctionTest, GraphToFunctionDefWithPlaceholderAttr) {
NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node3", "v2",
&node3);
TF_Output inputs[] = {};
TF_Output outputs[] = {{node1, 0}, {node2, 0}, {node3, 0}};
func_ = TF_GraphToFunction(
func_graph.get(), "func", /*append_hash_to_fn_name=*/false, -1,
/*opers=*/nullptr, 0, inputs, 3, outputs,
/*opers=*/nullptr, 0, nullptr, 3, outputs,
/*output_names=*/nullptr,
/*opts=*/nullptr, /*description=*/nullptr, s.get());
ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
@ -1300,10 +1299,9 @@ TEST_F(CApiFunctionTest, GraphToFunctionDefWithArgAttr) {
&node);
TF_Output inputs[] = {{node, 0}};
TF_Output outputs[] = {};
func_ = TF_GraphToFunction(
func_graph.get(), "func", /*append_hash_to_fn_name=*/false, -1,
/*opers=*/nullptr, 1, inputs, 0, outputs,
/*opers=*/nullptr, 1, inputs, 0, nullptr,
/*output_names=*/nullptr,
/*opts=*/nullptr, /*description=*/nullptr, s.get());
ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
@ -1603,11 +1601,10 @@ void DefineStatefulFunction(const char* name, TF_Function** func) {
TF_Operation* random =
RandomUniform(shape, TF_FLOAT, func_graph.get(), s.get());
TF_Output inputs[] = {};
TF_Output outputs[] = {{random, 0}};
*func = TF_GraphToFunction(func_graph.get(), name,
/*append_hash_to_fn_name=*/false, -1,
/*opers=*/nullptr, 0, inputs, 1, outputs,
/*opers=*/nullptr, 0, nullptr, 1, outputs,
/*output_names=*/nullptr,
/*opts=*/nullptr, "", s.get());
ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());

View File

@ -188,7 +188,7 @@ namespace tensorflow {
Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
TF_Tensor* TF_TensorFromTensor(const Tensor& src, Status* status);
Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
TF_Buffer* out);

View File

@ -51,7 +51,7 @@ limitations under the License.
#include "tensorflow/core/util/equal_graph_def.h"
namespace tensorflow {
TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
TF_Tensor* TF_TensorFromTensor(const Tensor& src, Status* status);
Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
namespace {
@ -227,7 +227,7 @@ TEST(CAPI, LibraryLoadFunctions) {
void TestEncodeDecode(int line, const std::vector<string>& data) {
const tensorflow::int64 n = data.size();
TF_Status* status = TF_NewStatus();
Status status;
for (const std::vector<tensorflow::int64>& dims :
std::vector<std::vector<tensorflow::int64>>{
{n}, {1, n}, {n, 1}, {n / 2, 2}}) {
@ -236,8 +236,8 @@ void TestEncodeDecode(int line, const std::vector<string>& data) {
for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) {
src.flat<tstring>()(i) = data[i];
}
TF_Tensor* dst = TF_TensorFromTensor(src, status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_Tensor* dst = TF_TensorFromTensor(src, &status);
ASSERT_TRUE(status.ok()) << status.error_message();
// Convert back to a C++ Tensor and ensure we get expected output.
Tensor output;
@ -249,7 +249,6 @@ void TestEncodeDecode(int line, const std::vector<string>& data) {
TF_DeleteTensor(dst);
}
TF_DeleteStatus(status);
}
TEST(CAPI, TensorEncodeDecodeStrings) {
@ -1394,8 +1393,9 @@ TEST(CAPI, SavedModel) {
TF_Operation* input_op =
TF_GraphOperationByName(graph, input_op_name.c_str());
ASSERT_TRUE(input_op != nullptr);
csession.SetInputs({{input_op, TF_TensorFromTensor(input, s)}});
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
Status status;
csession.SetInputs({{input_op, TF_TensorFromTensor(input, &status)}});
ASSERT_TRUE(status.ok()) << status.error_message();
const tensorflow::string output_op_name(
tensorflow::ParseTensorName(output_name).first);
@ -2522,12 +2522,11 @@ TEST(CAPI, TestTensorIsNotAligned) {
// Take an unaligned slice.
Tensor y = x.Slice(1, 13);
TF_Status* status = TF_NewStatus();
TF_Tensor* a = TF_TensorFromTensor(y, status);
Status status;
TF_Tensor* a = TF_TensorFromTensor(y, &status);
if (EIGEN_MAX_ALIGN_BYTES > 0) {
EXPECT_FALSE(TF_TensorIsAligned(a));
}
TF_DeleteStatus(status);
TF_DeleteTensor(a);
}

View File

@ -8,11 +8,11 @@ load(
"tfe_xla_copts",
)
load(
"//tensorflow/core/platform:default/build_config.bzl",
"//tensorflow/core/platform:build_config.bzl",
"tf_kernel_tests_linkstatic",
)
load(
"//tensorflow/core/platform:default/build_config_root.bzl",
"//tensorflow/core/platform:build_config_root.bzl",
"tf_cuda_tests_tags",
)
@ -28,6 +28,7 @@ tf_cuda_library(
"c_api_experimental.h",
"c_api_internal.cc",
"c_api_internal.h",
"tensor_handle_interface.h",
],
hdrs = ["c_api.h"],
copts = tf_copts() + tfe_xla_copts(),
@ -37,9 +38,11 @@ tf_cuda_library(
"//tensorflow/core:android_tensorflow_lib_lite",
],
"//conditions:default": [
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/container:fixed_array",
"//tensorflow/c:c_api",
"//tensorflow/c:c_api_internal",
"//tensorflow/c:tf_tensor_internal",
"//tensorflow/core:core_cpu",
"//tensorflow/core/common_runtime/eager:attr_builder",
"//tensorflow/core/common_runtime/eager:context",
@ -53,6 +56,7 @@ tf_cuda_library(
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/platform:errors",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/profiler/lib:traceme",
],
@ -85,9 +89,25 @@ tf_cuda_library(
alwayslink = 1,
)
filegroup(
name = "pywrap_eager_hdrs",
srcs = [
"c_api_experimental.h",
"c_api_internal.h",
"tensor_handle_interface.h",
],
visibility = [
"//tensorflow/core:__pkg__",
"//tensorflow/python:__pkg__",
],
)
tf_cuda_library(
name = "c_api_internal",
srcs = ["c_api_experimental.h"],
srcs = [
"c_api_experimental.h",
"tensor_handle_interface.h",
],
hdrs = ["c_api_internal.h"],
visibility = [
"//learning/deepmind/courier:__subpackages__",

View File

@ -26,10 +26,12 @@ limitations under the License.
#include "tensorflow/core/platform/platform.h"
// clang-format on
#include "absl/algorithm/container.h"
#include "absl/container/fixed_array.h"
#include "absl/memory/memory.h"
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/c/tf_tensor_internal.h"
#include "tensorflow/c/eager/c_api_experimental.h"
#include "tensorflow/c/eager/c_api_internal.h"
#include "tensorflow/core/common_runtime/device.h"
@ -38,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/platform.h" // NOLINT
#include "tensorflow/core/protobuf/error_codes.pb.h"
#include "tensorflow/core/util/device_name_utils.h"
@ -233,7 +236,7 @@ tensorflow::Status GetReplacedFromExistingWorkers(
std::vector<tensorflow::eager::KeepAliveResponse> responses(
existing_workers->size());
for (int i = 0; i < existing_workers->size(); i++) {
tensorflow::eager::EagerClient* eager_client;
tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> eager_client;
statuses[i] =
client_cache->GetClient(existing_workers->at(i), &eager_client);
if (!statuses[i].ok()) {
@ -282,7 +285,7 @@ tensorflow::Status CreateRemoteContexts(
continue;
}
tensorflow::eager::EagerClient* eager_client;
tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> eager_client;
statuses[i] = remote_eager_workers->GetClient(remote_worker, &eager_client);
if (eager_client == nullptr) {
statuses[i] = tensorflow::errors::Internal(
@ -340,7 +343,7 @@ tensorflow::Status UpdateRemoteContexts(
continue;
}
tensorflow::eager::EagerClient* eager_client;
tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> eager_client;
statuses[i] = remote_eager_workers->GetClient(remote_worker, &eager_client);
if (eager_client == nullptr) {
statuses[i] = tensorflow::errors::Internal(
@ -406,6 +409,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
// New server created for new server_def. Unused if updating server_def.
std::unique_ptr<tensorflow::ServerInterface> new_server;
tensorflow::EagerContext* context = ctx->context;
tensorflow::GrpcServer* grpc_server;
if (reset_context) {
LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &new_server));
@ -413,26 +417,25 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
LOG_AND_RETURN_IF_ERROR(
ListRemoteWorkers(grpc_server, worker_name, &remote_workers));
} else {
LOG_AND_RETURN_IF_ERROR(ListRemoteWorkers(
ctx->context->GetServer(), worker_name, &curr_remote_workers));
LOG_AND_RETURN_IF_ERROR(ListRemoteWorkers(context->GetServer(), worker_name,
&curr_remote_workers));
// No need to check the cast here, since `ListRemoteWorkers` already checks
// if the server is a GRPC server or not.
grpc_server =
dynamic_cast<tensorflow::GrpcServer*>(ctx->context->GetServer());
grpc_server = dynamic_cast<tensorflow::GrpcServer*>(context->GetServer());
LOG_AND_RETURN_IF_ERROR(grpc_server->UpdateServerDef(server_def));
LOG_AND_RETURN_IF_ERROR(
ListRemoteWorkers(grpc_server, worker_name, &remote_workers));
}
tensorflow::uint64 context_id = ctx->context->GetContextId();
tensorflow::uint64 context_view_id = ctx->context->GetContextViewId();
tensorflow::uint64 context_id = context->GetContextId();
tensorflow::uint64 context_view_id = context->GetContextViewId();
if (reset_context) {
context_id = tensorflow::EagerContext::NewContextId();
context_view_id = 0;
// Make master eager context accessible by local eager service, which might
// receive send tensor requests from remote workers.
LOG_AND_RETURN_IF_ERROR(grpc_server->AddMasterEagerContextToEagerService(
context_id, ctx->context));
LOG_AND_RETURN_IF_ERROR(
grpc_server->AddMasterEagerContextToEagerService(context_id, context));
}
std::unique_ptr<tensorflow::eager::EagerClientCache> remote_eager_workers;
@ -461,11 +464,11 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
&new_remote_device_mgr));
remote_device_mgr = new_remote_device_mgr.get();
} else {
ctx->context->ClearCaches();
context->ClearCachesAndDefaultExecutor();
// TODO(b/143914772): Potential memory leak if rendezvous has pending
// tensors for removed / replaced workers.
remote_device_mgr = ctx->context->GetOwnedRemoteDeviceMgr();
remote_device_mgr = context->GetOwnedRemoteDeviceMgr();
if (remote_device_mgr == nullptr) {
LOG_AND_RETURN_IF_ERROR(tensorflow::errors::InvalidArgument(
"Updating context with an invalid set of remote devices."));
@ -476,8 +479,8 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
&added_workers, &removed_workers,
&existing_workers);
LOG_AND_RETURN_IF_ERROR(GetReplacedFromExistingWorkers(
&existing_workers, context_id, ctx->context->GetContextViewId(),
server_def, remote_eager_workers.get(), &replaced_workers));
&existing_workers, context_id, context->GetContextViewId(), server_def,
remote_eager_workers.get(), &replaced_workers));
if (VLOG_IS_ON(1)) {
VLOG(1) << "Updating cluster with following changes";
for (const string& w : added_workers) VLOG(1) << " Added worker " << w;
@ -531,9 +534,8 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
if (reset_context) {
LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts(
remote_workers, context_id, context_view_id, keep_alive_secs,
server_def, remote_eager_workers.get(),
ctx->context->Executor().Async(),
ctx->context->LazyCopyFunctionRemoteInputs(), base_request));
server_def, remote_eager_workers.get(), context->Executor().Async(),
context->LazyCopyFunctionRemoteInputs(), base_request));
} else {
// The master's context_view_id will be incremented by one
// the UpdateRemoteMaster call later. We want all new workers and
@ -542,9 +544,8 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
// context_view_id + 1.
LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts(
added_workers, context_id, context_view_id + 1, keep_alive_secs,
server_def, remote_eager_workers.get(),
ctx->context->Executor().Async(),
ctx->context->LazyCopyFunctionRemoteInputs(), base_request));
server_def, remote_eager_workers.get(), context->Executor().Async(),
context->LazyCopyFunctionRemoteInputs(), base_request));
if (!existing_workers.empty()) {
if (VLOG_IS_ON(1)) {
for (const string& w : existing_workers) {
@ -575,12 +576,12 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
TF_RETURN_IF_ERROR(r->Initialize(worker_session.get()));
tensorflow::DistributedFunctionLibraryRuntime* cluster_flr =
tensorflow::eager::CreateClusterFLR(context_id, ctx->context,
tensorflow::eager::CreateClusterFLR(context_id, context,
worker_session.get());
auto remote_mgr = absl::make_unique<tensorflow::eager::RemoteMgr>(
/*is_master=*/true, ctx->context);
/*is_master=*/true, context);
LOG_AND_RETURN_IF_ERROR(ctx->context->InitializeRemoteMaster(
LOG_AND_RETURN_IF_ERROR(context->InitializeRemoteMaster(
std::move(new_server), grpc_server->worker_env(), worker_session,
std::move(remote_eager_workers), std::move(new_remote_device_mgr),
remote_workers, context_id, r, device_mgr, keep_alive_secs, cluster_flr,
@ -598,9 +599,9 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
grpc_server->worker_env()->session_mgr->WorkerSessionForSession(
session_name, &worker_session));
tensorflow::DistributedFunctionLibraryRuntime* cluster_flr =
tensorflow::eager::CreateClusterFLR(context_id, ctx->context,
tensorflow::eager::CreateClusterFLR(context_id, context,
worker_session.get());
LOG_AND_RETURN_IF_ERROR(ctx->context->UpdateRemoteMaster(
LOG_AND_RETURN_IF_ERROR(context->UpdateRemoteMaster(
grpc_server->worker_env(), std::move(remote_eager_workers),
added_workers, removed_workers, context_id, r, device_mgr,
keep_alive_secs, cluster_flr));
@ -627,7 +628,8 @@ tensorflow::Status OpInferSingleInputAttrs(TFE_Op* op,
}
const std::string& type_attr = input_def.type_attr();
if (!type_attr.empty() && ictx->attrs.find(type_attr) == ictx->attrs.end()) {
op->operation.MutableAttrs()->Set(type_attr, input->handle->dtype);
op->operation.MutableAttrs()->Set(
type_attr, static_cast<tensorflow::DataType>(input->handle.DataType()));
ictx->attrs.insert(type_attr);
}
return tensorflow::Status::OK();
@ -635,7 +637,7 @@ tensorflow::Status OpInferSingleInputAttrs(TFE_Op* op,
void OpInferSingleTypeInputListAttrs(TFE_Op* op,
const tensorflow::OpDef::ArgDef& input_def,
TFE_TensorHandle** inputs,
const tensorflow::DataType dtype,
int num_inputs) {
TFE_OpInferenceContext* ictx = op->inference_ctx.get();
if (ictx->attrs.find(input_def.number_attr()) == ictx->attrs.end()) {
@ -643,26 +645,20 @@ void OpInferSingleTypeInputListAttrs(TFE_Op* op,
ictx->attrs.insert(input_def.number_attr());
}
if (ictx->attrs.find(input_def.type_attr()) == ictx->attrs.end()) {
op->operation.MutableAttrs()->Set(input_def.type_attr(),
inputs[0]->handle->dtype);
op->operation.MutableAttrs()->Set(input_def.type_attr(), dtype);
ictx->attrs.insert(input_def.type_attr());
}
}
void OpInferMixedTypeInputListAttrs(TFE_Op* op,
const tensorflow::OpDef::ArgDef& input_def,
TFE_TensorHandle** inputs, int num_inputs) {
void OpInferMixedTypeInputListAttrs(
TFE_Op* op, const tensorflow::OpDef::ArgDef& input_def,
const std::vector<tensorflow::DataType>& dtypes) {
TFE_OpInferenceContext* ictx = op->inference_ctx.get();
if (ictx->attrs.find(input_def.type_list_attr()) == ictx->attrs.end()) {
std::unique_ptr<tensorflow::DataType[]> dtypes(
new tensorflow::DataType[num_inputs]);
for (int i = 0; i < num_inputs; ++i) {
dtypes[i] = inputs[i]->handle->dtype;
}
op->operation.MutableAttrs()->Set(
input_def.type_list_attr(),
tensorflow::gtl::ArraySlice<const tensorflow::DataType>(dtypes.get(),
num_inputs));
tensorflow::gtl::ArraySlice<const tensorflow::DataType>(dtypes.data(),
dtypes.size()));
ictx->attrs.insert(input_def.type_list_attr());
}
}
@ -672,10 +668,18 @@ tensorflow::Status OpInferInputListAttrs(TFE_Op* op, TFE_TensorHandle** inputs,
TFE_OpInferenceContext* ictx = op->inference_ctx.get();
const auto& input_def = ictx->op_def->input_arg(ictx->input_arg_idx++);
if (!input_def.type_list_attr().empty()) {
OpInferMixedTypeInputListAttrs(op, input_def, inputs, num_inputs);
std::vector<tensorflow::DataType> dtypes(num_inputs);
for (int i = 0; i < num_inputs; ++i) {
dtypes[i] =
static_cast<const tensorflow::DataType>(inputs[i]->handle.DataType());
}
OpInferMixedTypeInputListAttrs(op, input_def, dtypes);
} else if (!input_def.type_attr().empty() &&
!input_def.number_attr().empty()) {
OpInferSingleTypeInputListAttrs(op, input_def, inputs, num_inputs);
OpInferSingleTypeInputListAttrs(
op, input_def,
static_cast<const tensorflow::DataType>(inputs[0]->handle.DataType()),
num_inputs);
} else {
return tensorflow::errors::InvalidArgument("Invalid input list definition");
}
@ -717,12 +721,14 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
tensorflow::Rendezvous* r =
new tensorflow::IntraProcessRendezvous(device_mgr.get());
return new TFE_Context(opts->session_options.options,
opts->device_placement_policy, opts->mirroring_policy,
opts->async, opts->lazy_remote_inputs_copy,
device_mgr.release(),
/*device_mgr_owned*/ true, r,
tensorflow::GetDefaultCustomKernelCreator());
return new TFE_Context{new tensorflow::EagerContext(
opts->session_options.options,
static_cast<tensorflow::ContextDevicePlacementPolicy>(
opts->device_placement_policy),
static_cast<tensorflow::ContextMirroringPolicy>(opts->mirroring_policy),
opts->async, opts->lazy_remote_inputs_copy, device_mgr.release(),
/*device_mgr_owned*/ true, r,
tensorflow::GetDefaultCustomKernelCreator())};
}
TFE_Context* TFE_NewContextFromSession(const TFE_ContextOptions* opts,
@ -733,25 +739,33 @@ TFE_Context* TFE_NewContextFromSession(const TFE_ContextOptions* opts,
tensorflow::Rendezvous* r =
new tensorflow::IntraProcessRendezvous(device_mgr);
return new TFE_Context(opts->session_options.options,
opts->device_placement_policy, opts->mirroring_policy,
opts->async, opts->lazy_remote_inputs_copy, device_mgr,
/*device_mgr_owned*/ false, r,
tensorflow::GetDefaultCustomKernelCreator());
return new TFE_Context{new tensorflow::EagerContext(
opts->session_options.options,
static_cast<tensorflow::ContextDevicePlacementPolicy>(
opts->device_placement_policy),
static_cast<tensorflow::ContextMirroringPolicy>(opts->mirroring_policy),
opts->async, opts->lazy_remote_inputs_copy, device_mgr,
/*device_mgr_owned*/ false, r,
tensorflow::GetDefaultCustomKernelCreator())};
}
void TFE_DeleteContext(TFE_Context* ctx) { delete ctx; }
void TFE_DeleteContext(TFE_Context* ctx) {
// context->RefCountIsOne() should be true here.
// TODO(iga): Remove EagerContext refcounting.
ctx->context->Unref();
delete ctx;
}
TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
TF_DeviceList* list = new TF_DeviceList;
ctx->context->local_device_mgr()->ListDeviceAttributes(&list->response);
if (ctx->context->remote_device_mgr()) {
ctx->context->remote_device_mgr()->ListDeviceAttributes(&list->response);
}
return list;
TF_DeviceList* l = new TF_DeviceList;
ctx->context->ListDevices(&l->response);
return l;
}
void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context->ClearCaches(); }
void TFE_ContextClearCaches(TFE_Context* ctx) {
ctx->context->ClearCachesAndThreadExecutors();
}
// Set server_def on the context, possibly updating it.
TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
@ -807,8 +821,9 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx,
"TFE_ContextSetServerDef not supported on mobile");
return false;
#else // !defined(IS_MOBILE_PLATFORM)
tensorflow::EagerContext* context = ctx->context;
tensorflow::GrpcServer* grpc_server =
static_cast<tensorflow::GrpcServer*>(ctx->context->GetServer());
static_cast<tensorflow::GrpcServer*>(context->GetServer());
std::unique_ptr<tensorflow::eager::EagerClientCache> remote_eager_workers;
status->status = grpc_server->master_env()->worker_cache->GetEagerClientCache(
@ -819,7 +834,7 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx,
}
// TODO(yuefengz): support partially specified `worker_name`.
tensorflow::eager::EagerClient* eager_client;
tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> eager_client;
status->status = remote_eager_workers->GetClient(worker_name, &eager_client);
if (!status->status.ok()) {
return false;
@ -827,7 +842,7 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx,
// Send a rpc request to the worker to check aliveness.
tensorflow::eager::KeepAliveRequest request;
request.set_context_id(ctx->context->GetContextId());
request.set_context_id(context->GetContextId());
tensorflow::eager::KeepAliveResponse response;
tensorflow::Status keep_alive_status;
@ -882,108 +897,179 @@ void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
if (h == nullptr) return;
tensorflow::profiler::TraceMe activity(
"TFE_DeleteTensorHandle", tensorflow::profiler::TraceMeLevel::kInfo);
VLOG(1) << "Deleting tensor handle " << h << " with internal handle "
<< h->handle;
if (h->handle) {
h->handle->Unref();
}
delete h;
}
tensorflow::TensorHandleInterface::~TensorHandleInterface() {
VLOG(1) << "Deleting tensor handle " << this << " with internal handle "
<< handle_;
if (handle_) {
handle_->Unref();
}
}
bool tensorflow::TensorHandleInterface::IsValid(Status* status) const {
if (handle_ == nullptr) {
*status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return false;
}
return true;
}
TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) {
return static_cast<TF_DataType>(h->handle->dtype);
return h->handle.DataType();
}
TF_DataType tensorflow::TensorHandleInterface::DataType() const {
return static_cast<TF_DataType>(handle_->dtype);
}
int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return -1;
}
return h->handle.NumDims(&status->status);
}
int tensorflow::TensorHandleInterface::NumDims(Status* status) const {
if (!IsValid(status)) {
return -1;
}
int result;
status->status = h->handle->NumDims(&result);
*status = handle_->NumDims(&result);
return result;
}
int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return -1;
}
return h->handle.NumElements(&status->status);
}
int64_t tensorflow::TensorHandleInterface::NumElements(Status* status) const {
if (!IsValid(status)) {
return -1;
}
tensorflow::int64 result;
status->status = h->handle->NumElements(&result);
*status = handle_->NumElements(&result);
return result;
}
int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index,
TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return -1;
}
return h->handle.Dim(dim_index, &status->status);
}
int64_t tensorflow::TensorHandleInterface::Dim(int dim_index,
Status* status) const {
if (!IsValid(status)) {
return -1;
}
tensorflow::int64 result;
status->status = h->handle->Dim(dim_index, &result);
*status = handle_->Dim(dim_index, &result);
return result;
}
const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return nullptr;
}
tensorflow::Device* d = h->handle->op_device();
return h->handle.DeviceName(&status->status);
}
const char* tensorflow::TensorHandleInterface::DeviceName(
Status* status) const {
if (!IsValid(status)) {
return nullptr;
}
tensorflow::Device* d = handle_->op_device();
return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
: d->name().c_str();
}
const char* TFE_TensorHandleBackingDeviceName(TFE_TensorHandle* h,
TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return nullptr;
}
tensorflow::Device* d = h->handle->device();
return h->handle.BackingDeviceName(&status->status);
}
const char* tensorflow::TensorHandleInterface::BackingDeviceName(
Status* status) const {
if (!IsValid(status)) {
return nullptr;
}
tensorflow::Device* d = handle_->device();
return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
: d->name().c_str();
}
TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopySharingTensor(
TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr || !h->handle.IsValid(&status->status)) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return nullptr;
}
h->handle->Ref();
return h->handle.Copy();
}
return new TFE_TensorHandle(h->handle);
TFE_TensorHandle* tensorflow::TensorHandleInterface::Copy() {
handle_->Ref();
return new TFE_TensorHandle{TensorHandleInterface(handle_)};
}
TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || h->handle == nullptr) {
if (h == nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return nullptr;
}
tensorflow::TensorHandle* handle = h->handle;
return h->handle.Resolve(&status->status);
}
TF_Tensor* tensorflow::TensorHandleInterface::Resolve(Status* status) {
if (!IsValid(status)) {
return nullptr;
}
// TODO(agarwal): move this implementation inside TFE_TensorHandle.
if (handle->IsRemote()) {
if (handle_->IsRemote()) {
const tensorflow::Tensor* t = nullptr;
tensorflow::TensorHandle* h_cpu = nullptr;
status->status = EagerCopyToDevice(
handle, handle->Context(), &handle->Context()->Executor(),
handle->Context()->HostCPU(), false, &h_cpu);
if (!status->status.ok()) {
*status = EagerCopyToDevice(handle_, handle_->Context(),
&handle_->Context()->Executor(),
handle_->Context()->HostCPU(), false, &h_cpu);
if (!status->ok()) {
return nullptr;
}
status->status = h_cpu->Tensor(&t);
if (!status->status.ok()) {
*status = h_cpu->Tensor(&t);
if (!status->ok()) {
h_cpu->Unref();
return nullptr;
}
@ -992,24 +1078,121 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
return retval;
} else {
tensorflow::Tensor tensor;
if (IsCPU(handle->device())) {
if (IsCPU(handle_->device())) {
const tensorflow::Tensor* src = nullptr;
status->status = handle->Tensor(&src);
if (!status->status.ok()) return nullptr;
*status = handle_->Tensor(&src);
if (!status->ok()) return nullptr;
tensor = *src;
} else {
tensorflow::EagerContext* ctx = handle->Context();
tensorflow::EagerContext* ctx = handle_->Context();
CHECK_NE(ctx, nullptr);
status->status = h->handle->CopyToDevice(ctx, ctx->HostCPU(), &tensor);
if (!status->status.ok()) return nullptr;
*status = handle_->CopyToDevice(ctx, ctx->HostCPU(), &tensor);
if (!status->ok()) return nullptr;
}
return tensorflow::TF_TensorFromTensor(tensor, status);
}
}
void* TFE_TensorHandleDevicePointer(TFE_TensorHandle* h, TF_Status* status) {
if (h == nullptr || !h->handle.IsValid(&status->status)) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return nullptr;
}
tensorflow::TensorHandle* handle = h->handle.Handle();
if (handle->IsRemote()) {
status->status = tensorflow::errors::InvalidArgument(
"TFE_TensorHandleDevicePointer may not be called on a remote tensor "
"handle.");
return nullptr;
}
if (handle->device() != nullptr) {
status->status = handle->device()->Sync();
if (!status->status.ok()) {
return nullptr;
}
}
const tensorflow::Tensor* tensor;
status->status = handle->Tensor(&tensor);
if (!status->status.ok()) {
return nullptr;
}
return const_cast<void*>(
static_cast<const void*>(tensor->tensor_data().data()));
}
TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory(
TFE_Context* ctx, const char* device_name, TF_DataType dtype,
const int64_t* dims, int num_dims, void* data, size_t len,
void (*deallocator)(void* data, size_t len, void* arg),
void* deallocator_arg, TF_Status* status) {
tensorflow::Device* device;
tensorflow::EagerContext* context = ctx->context;
status->status = context->FindDeviceFromName(device_name, &device);
if (!status->status.ok()) {
deallocator(data, len, deallocator_arg);
return nullptr;
}
std::vector<tensorflow::int64> dimvec(num_dims);
for (int i = 0; i < num_dims; ++i) {
dimvec[i] = static_cast<tensorflow::int64>(dims[i]);
}
if (dtype == TF_STRING || dtype == TF_RESOURCE ||
!tensorflow::DataTypeCanUseMemcpy(
static_cast<tensorflow::DataType>(dtype))) {
status->status = tensorflow::errors::InvalidArgument(
"Trying to create a tensor with a pointer to non-pod memory.");
deallocator(data, len, deallocator_arg);
return nullptr;
}
// TODO(apassos) do we need to wrap the deallocator here to make sure to sync
// the device?
TF_ManagedBuffer* buf =
new TF_ManagedBuffer(data, len, deallocator, deallocator_arg);
tensorflow::Tensor t(static_cast<tensorflow::DataType>(dtype),
tensorflow::TensorShape(dimvec), buf);
buf->Unref();
tensorflow::TensorHandle* ret_handle;
status->status = tensorflow::TensorHandle::CreateLocalHandle(
t, device, context, &ret_handle);
if (!status->status.ok()) {
return nullptr;
}
return new TFE_TensorHandle{tensorflow::TensorHandleInterface(ret_handle)};
}
// This function will block until the operation that produces `h` has
// completed. This is only valid on local TFE_TensorHandles. Returns the size in
// bytes of the memory pointed to by the device pointer returned by
// TFE_TensorHandleDevicePointer.
size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h,
TF_Status* status) {
if (h == nullptr || !h->handle.IsValid(&status->status)) {
status->status = tensorflow::errors::InvalidArgument(
"The passed in handle is a nullptr");
return 0;
}
tensorflow::TensorHandle* handle = h->handle.Handle();
if (handle->IsRemote()) {
status->status = tensorflow::errors::InvalidArgument(
"TFE_TensorHandleDeviceMemorySize may not be called on a remote tensor "
"handle.");
return 0;
}
const tensorflow::Tensor* tensor;
status->status = handle->Tensor(&tensor);
if (!status->status.ok()) {
return 0;
}
return tensor->TotalBytes();
}
TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
TF_Status* status) {
return NewOrResetOp(ctx, op_or_function_name, status,
return NewOrResetOp(ctx, op_or_function_name, nullptr, status,
/* op_to_reset= */ nullptr);
}
@ -1035,16 +1218,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
}
void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) {
op->operation.AddInput(input->handle);
if (op->inference_ctx) {
status->status = OpInferSingleInputAttrs(op, input);
return op->AddInput(input, status);
}
void TFE_Op::AddInput(TFE_TensorHandle* input, TF_Status* status) {
operation.AddInput(input->handle.Handle());
if (inference_ctx) {
status->status = OpInferSingleInputAttrs(this, input);
}
}
void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs,
TF_Status* status) {
for (int i = 0; i < num_inputs; ++i) {
op->operation.AddInput(inputs[i]->handle);
op->operation.AddInput(inputs[i]->handle.Handle());
}
if (op->inference_ctx) {
status->status = OpInferInputListAttrs(op, inputs, num_inputs);
@ -1282,14 +1469,20 @@ TF_CAPI_EXPORT extern int TFE_OpGetOutputLength(TFE_Op* op,
void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
TF_Status* status) {
VLOG(1) << "Calling TFE_Execute() on op " << op;
op->Execute(retvals, num_retvals, status);
}
void TFE_Op::Execute(TFE_TensorHandle** retvals, int* num_retvals,
TF_Status* status) {
absl::FixedArray<tensorflow::TensorHandle*> handle_retvals(*num_retvals);
status->status = tensorflow::EagerExecute(&op->operation,
handle_retvals.data(), num_retvals);
status->status =
tensorflow::EagerExecute(&operation, handle_retvals.data(), num_retvals);
if (!status->status.ok()) {
return;
}
for (int i = 0; i < *num_retvals; ++i) {
retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
retvals[i] = new TFE_TensorHandle{
tensorflow::TensorHandleInterface(handle_retvals[i])};
}
}
@ -1299,15 +1492,16 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
TF_Status* status) {
tensorflow::TensorHandle* handle = nullptr;
tensorflow::Device* device;
status->status = ctx->context->FindDeviceFromName(device_name, &device);
tensorflow::EagerContext* context = ctx->context;
status->status = context->FindDeviceFromName(device_name, &device);
if (!status->status.ok()) {
return nullptr;
}
status->status = tensorflow::EagerCopyToDevice(h->handle, ctx->context,
&ctx->context->Executor(),
device, false, &handle);
status->status = tensorflow::EagerCopyToDevice(h->handle.Handle(), context,
&context->Executor(), device,
false, &handle);
if (status->status.ok()) {
return new TFE_TensorHandle(handle);
return new TFE_TensorHandle{tensorflow::TensorHandleInterface(handle)};
}
return nullptr;
}
@ -1355,11 +1549,12 @@ TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t,
void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
TF_Status* status) {
status->status = ctx->context->Executor().WaitForAllPendingNodes();
tensorflow::EagerContext* context = ctx->context;
status->status = context->Executor().WaitForAllPendingNodes();
if (!status->status.ok()) return;
tensorflow::mutex_lock ml(*ctx->context->MetadataMu());
status->status = MessageToBuffer(*ctx->context->RunMetadataProto(), buf);
ctx->context->ClearRunMetadata();
tensorflow::mutex_lock ml(*context->MetadataMu());
status->status = MessageToBuffer(*context->RunMetadataProto(), buf);
context->ClearRunMetadata();
}
namespace {

View File

@ -206,7 +206,7 @@ typedef struct TFE_TensorDebugInfo TFE_TensorDebugInfo;
// error and nullptr is returned. This function can block till the operation
// that produces `handle` has completed.
TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
TFE_TensorHandle* handle, TF_Status* status);
TFE_TensorHandle* h, TF_Status* status);
// Deletes `debug_info`.
TF_CAPI_EXPORT extern void TFE_DeleteTensorDebugInfo(

View File

@ -28,19 +28,22 @@ using tensorflow::string;
namespace {
std::vector<int64> TensorShapeAsVector(TFE_TensorHandle* handle,
TF_Status* status) {
std::vector<int64> TensorShapeAsVector(const tensorflow::TensorHandle& handle,
tensorflow::Status* status) {
std::vector<int64> shape;
int rank = TFE_TensorHandleNumDims(handle, status);
if (TF_GetCode(status) != TF_OK) {
int rank = -1;
*status = handle.NumDims(&rank);
if (!status->ok()) {
return shape;
}
shape.reserve(rank);
for (int i = 0; i < rank; ++i) {
shape.push_back(TFE_TensorHandleDim(handle, i, status));
if (TF_GetCode(status) != TF_OK) {
tensorflow::int64 dim;
*status = handle.Dim(i, &dim);
if (!status->ok()) {
return shape;
}
shape.push_back(dim);
}
return shape;
}
@ -50,15 +53,20 @@ std::vector<int64> TensorShapeAsVector(TFE_TensorHandle* handle,
extern "C" {
TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
TFE_TensorHandle* handle, TF_Status* status) {
TFE_TensorHandle* h, TF_Status* status) {
return h->handle.TensorDebugInfo(&status->status);
}
TFE_TensorDebugInfo* tensorflow::TensorHandleInterface::TensorDebugInfo(
Status* status) {
const tensorflow::Tensor* tensor;
status->status = handle->handle->Tensor(&tensor);
if (TF_GetCode(status) != TF_OK) {
*status = handle_->Tensor(&tensor);
if (!status->ok()) {
return nullptr;
}
#ifdef TENSORFLOW_EAGER_USE_XLA
tensorflow::Device* device = handle->handle->device();
tensorflow::Device* device = handle_->device();
// If tensor resides on an XLA device, use XLA device's PaddedShapeFn.
tensorflow::XlaDevice* xla_device =
@ -67,15 +75,15 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
tensorflow::XlaDevice::PaddedShapeFn shape_fn =
xla_device->metadata().padded_shape_fn();
xla::Shape padded_shape;
status->status = shape_fn(*tensor, &padded_shape);
if (!status->status.ok()) {
*status = shape_fn(*tensor, &padded_shape);
if (!status->ok()) {
return nullptr;
}
if (VLOG_IS_ON(3)) {
std::vector<int64> shape_to_log = TensorShapeAsVector(handle, status);
if (!status->status.ok()) {
std::vector<int64> shape_to_log = TensorShapeAsVector(*handle_, status);
if (!status->ok()) {
// Ignore the status here as we are simply logging.
status->status = tensorflow::Status::OK();
*status = tensorflow::Status::OK();
} else {
VLOG(3) << "Fully padded shape of ["
<< absl::StrJoin(shape_to_log, ", ") << "] is "
@ -88,7 +96,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
// Currently, the only case of XlaTensor containing a tuple shape is to
// represent 64 bit ints, doubles, and complex numbers (we don't support
// 64bit complex numbers).
status->status = tensorflow::errors::InvalidArgument(
*status = tensorflow::errors::InvalidArgument(
"XlaTensors should only contain tuples of size 2. Shape: ",
padded_shape.DebugString());
return nullptr;
@ -100,13 +108,13 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
const xla::Shape& shape1 =
xla::ShapeUtil::GetTupleElementShape(padded_shape, 1);
if (shape0.IsTuple() || shape1.IsTuple()) {
status->status = tensorflow::errors::InvalidArgument(
*status = tensorflow::errors::InvalidArgument(
"XlaTensors should not contain nested tuples. Shape: ",
padded_shape.DebugString());
return nullptr;
}
if (!xla::ShapeUtil::Equal(shape0, shape1)) {
status->status = tensorflow::errors::InvalidArgument(
*status = tensorflow::errors::InvalidArgument(
"Subshapes of XlaTensors should be the same. Shape: ",
padded_shape.DebugString());
return nullptr;
@ -131,15 +139,15 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
dev_dims.push_back(padded_shape.dimensions(dim_index));
}
}
status->status = tensorflow::Status::OK();
*status = tensorflow::Status::OK();
return new TFE_TensorDebugInfo(dev_dims);
}
#endif // TENSORFLOW_EAGER_USE_XLA
// If the tensor is not an XLA tensor, the device shape is
// the same as regular tensor shape.
std::vector<int64> dev_dims = TensorShapeAsVector(handle, status);
if (TF_GetCode(status) != TF_OK) {
std::vector<int64> dev_dims = TensorShapeAsVector(*handle_, status);
if (!status->ok()) {
return nullptr;
}
return new TFE_TensorDebugInfo(dev_dims);

View File

@ -29,9 +29,11 @@ limitations under the License.
using tensorflow::string;
void TFE_OpReset(TFE_Context* ctx, const char* op_or_function_name,
TF_Status* status, TFE_Op* op_to_reset) {
const char* raw_device_name, TF_Status* status,
TFE_Op* op_to_reset) {
if (op_to_reset) {
NewOrResetOp(ctx, op_or_function_name, status, op_to_reset);
NewOrResetOp(ctx, op_or_function_name, raw_device_name, status,
op_to_reset);
} else {
TF_SetStatus(status, TF_INVALID_ARGUMENT,
"op_to_reset should not be nullptr");
@ -39,7 +41,7 @@ void TFE_OpReset(TFE_Context* ctx, const char* op_or_function_name,
}
void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
op->operation.ConsumeInput(h->handle);
op->operation.ConsumeInput(h->handle.Handle());
}
TFE_Profiler* TFE_NewProfiler() { return new TFE_Profiler(); }

View File

@ -22,8 +22,16 @@ limitations under the License.
extern "C" {
#endif
// Resets `op_to_reset` with `op_or_function_name` and `raw_device_name`. This
// is a performance optimization: it reuses an existing unused op rather than
// creating a new op every time. If `raw_device_name` is `NULL` or empty, it
// does not set the device name. If it is not `NULL`, it attempts to parse and
// set the device name. It is effectively `TFE_OpSetDevice`, but faster than
// calling that separately because, if the existing op already has the same
// `raw_device_name`, it skips parsing and leaves the device as is.
TF_CAPI_EXPORT extern void TFE_OpReset(TFE_Context* ctx,
const char* op_or_function_name,
const char* raw_device_name,
TF_Status* status, TFE_Op* op_to_reset);
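A minimal usage sketch of this reuse pattern (a sketch only, assuming `ctx`, `input`, and `status` were created earlier, e.g. via TFE_NewContext, TFE_NewTensorHandle and TF_NewStatus; status checks are omitted):
// Create the op once, then reset it on later iterations instead of
// re-allocating it. The input handle and loop bound are illustrative.
TFE_Op* op = TFE_NewOp(ctx, "Identity", status);
for (int step = 0; step < 4; ++step) {
  if (step > 0) {
    // If the existing op already carries this device string, parsing is
    // skipped and the device is left as is.
    TFE_OpReset(ctx, "Identity",
                "/job:localhost/replica:0/task:0/device:CPU:0", status, op);
  }
  TFE_OpAddInput(op, input, status);
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(op, retvals, &num_retvals, status);
  TFE_DeleteTensorHandle(retvals[0]);
}
TFE_DeleteOp(op);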
TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
@ -426,6 +434,30 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx,
const char* worker_name,
TF_Status* status);
// This function will block until the operation that produces `h` has
// completed. This is only valid on local TFE_TensorHandles. The pointer
// returned will be on the device in which the TFE_TensorHandle resides (e.g.
// for a GPU tensor this will return a pointer to GPU memory). The pointer is
// only guaranteed to be valid until TFE_DeleteTensorHandle is called on this
// TensorHandle. Only supports POD data types.
TF_CAPI_EXPORT extern void* TFE_TensorHandleDevicePointer(TFE_TensorHandle*,
TF_Status*);
// This function will block until the operation that produces `h` has
// completed. This is only valid on local TFE_TensorHandles. Returns the size in
// bytes of the memory pointed to by the device pointer returned above.
TF_CAPI_EXPORT extern size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle*,
TF_Status*);
// Creates a new TensorHandle from memory residing in `device_name`. Takes
// ownership of the memory, and will call `deallocator` to release it after TF
// no longer needs it or in case of error.
TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory(
TFE_Context* ctx, const char* device_name, TF_DataType, const int64_t* dims,
int num_dims, void* data, size_t len,
void (*deallocator)(void* data, size_t len, void* arg),
void* deallocator_arg, TF_Status* status);
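As a rough illustration, the three calls above can be combined to alias an existing handle's device buffer as a new handle; the TensorHandleOnDeviceMemory test further below exercises the full round trip. A sketch, assuming `ctx`, `device_name`, a local POD-dtype `handle` on that device, and `status` already exist (error checks omitted):
// Alias `handle`'s device memory as a new 2x2 float handle. The dims are
// illustrative and must match the contents of the underlying buffer.
void* data = TFE_TensorHandleDevicePointer(handle, status);
size_t len = TFE_TensorHandleDeviceMemorySize(handle, status);
int64_t dims[] = {2, 2};
TFE_TensorHandle* alias = TFE_NewTensorHandleFromDeviceMemory(
    ctx, device_name, TF_FLOAT, dims, 2, data, len,
    /*deallocator=*/[](void*, size_t, void* arg) {
      // Release the original handle once TF no longer needs the buffer.
      TFE_DeleteTensorHandle(static_cast<TFE_TensorHandle*>(arg));
    },
    /*deallocator_arg=*/handle, status);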
#ifdef __cplusplus
} /* end extern "C" */
#endif

View File

@ -495,5 +495,54 @@ void Executor_MatMul_CPU(bool async) {
TEST(CAPI, Executor_MatMul_CPU) { Executor_MatMul_CPU(false); }
TEST(CAPI, Executor_MatMul_CPUAsync) { Executor_MatMul_CPU(true); }
void Deleter(void* data, size_t unused, void* tensor_handle) {
TFE_DeleteTensorHandle(static_cast<TFE_TensorHandle*>(tensor_handle));
}
TEST(CAPI, TensorHandleOnDeviceMemory) {
TF_Status* status = TF_NewStatus();
TFE_ContextOptions* opts = TFE_NewContextOptions();
TFE_Context* ctx = TFE_NewContext(opts, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteContextOptions(opts);
TFE_TensorHandle* m = TestMatrixTensorHandle();
TF_Tensor* m_data = TFE_TensorHandleResolve(m, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
float* m_float = static_cast<float*>(TF_TensorData(m_data));
TF_DeviceList* devices = TFE_ContextListDevices(ctx, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
int num_devices = TF_DeviceListCount(devices);
for (int d = 0; d < num_devices; ++d) {
const char* name = TF_DeviceListName(devices, d, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_TensorHandle* copy = TFE_TensorHandleCopyToDevice(m, ctx, name, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
void* data = TFE_TensorHandleDevicePointer(copy, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
size_t size = TFE_TensorHandleDeviceMemorySize(copy, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
int64_t dims[] = {2, 2};
TFE_TensorHandle* copy_aliased = TFE_NewTensorHandleFromDeviceMemory(
ctx, name, TF_FLOAT, dims, 2, data, size, &Deleter, copy, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_TensorHandle* on_host =
TFE_TensorHandleCopyToDevice(copy_aliased, ctx, "CPU:0", status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_Tensor* resolved = TFE_TensorHandleResolve(on_host, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
const float* resolved_data =
static_cast<const float*>(TF_TensorData(resolved));
EXPECT_EQ(0, memcmp(m_float, resolved_data, 4 * sizeof(float)));
TF_DeleteTensor(resolved);
TFE_DeleteTensorHandle(copy_aliased); // Note that this will delete copy.
TFE_DeleteTensorHandle(on_host);
}
TF_DeleteTensor(m_data);
TFE_DeleteTensorHandle(m);
TFE_DeleteContext(ctx);
TF_DeleteStatus(status);
}
} // namespace
} // namespace tensorflow

View File

@ -14,10 +14,12 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/c/eager/c_api_internal.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/host_info.h"
TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name,
TF_Status* status, TFE_Op* op_to_reset) {
const char* raw_device_name, TF_Status* status,
TFE_Op* op_to_reset) {
const char* name = op_or_function_name; // Shorthand
const tensorflow::AttrTypeMap* types;
bool is_function = false;
@ -25,26 +27,22 @@ TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name,
if (!status->status.ok()) {
return nullptr;
}
auto create_or_reset = [&op_to_reset, &ctx, &name, &types](
bool is_function,
TFE_OpInferenceContext* inference_ctx) -> TFE_Op* {
if (op_to_reset) {
op_to_reset->Reset(ctx, name, is_function, types, inference_ctx);
return op_to_reset;
} else {
return new TFE_Op(ctx, name, is_function, types, inference_ctx);
}
};
if (op_to_reset && op_to_reset->ctx != ctx) {
status->status = tensorflow::errors::Internal(
"Cannot reset a TFE_Op from another TFE_Context");
return nullptr;
}
std::unique_ptr<TFE_OpInferenceContext> inference_ctx;
if (!is_function) {
const tensorflow::OpDef* op_def;
status->status = tensorflow::OpDefForOp(op_or_function_name, &op_def);
if (!status->status.ok()) {
return nullptr;
}
return create_or_reset(false, new TFE_OpInferenceContext(op_def));
}
if (!ctx->context->FindFunctionByName(name)) {
inference_ctx.reset(new TFE_OpInferenceContext(op_def));
} else if (!ctx->context->FindFunctionByName(name)) {
status->status = tensorflow::errors::NotFound(
"'", name,
"' is neither a type of a primitive operation nor a name "
@ -54,5 +52,15 @@ TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name,
"registered in the binary running in this process.");
return nullptr;
}
return create_or_reset(true, nullptr);
if (op_to_reset) {
status->status = op_to_reset->Reset(
name, is_function, types, raw_device_name, std::move(inference_ctx));
return op_to_reset;
}
TFE_Op* new_op =
new TFE_Op(ctx, name, is_function, types, std::move(inference_ctx));
status->status = new_op->operation.SetDeviceName(raw_device_name);
return new_op;
}

View File

@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"
#include "tensorflow/c/eager/tensor_handle_interface.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/eager/attr_builder.h"
#include "tensorflow/core/common_runtime/eager/context.h"
@ -58,40 +59,14 @@ struct TFE_ContextOptions {
TFE_DEVICE_PLACEMENT_SILENT};
TFE_ContextMirroringPolicy mirroring_policy{TFE_MIRRORING_NONE};
// If true, lazily copy the remote inputs of a function to the target devices.
bool lazy_remote_inputs_copy = false;
bool lazy_remote_inputs_copy = true;
};
struct TFE_Context {
TFE_Context(const tensorflow::SessionOptions& opts,
TFE_ContextDevicePlacementPolicy default_device_placement_policy,
TFE_ContextMirroringPolicy default_mirroring_policy, bool async,
const bool lazy_remote_inputs_copy,
const tensorflow::DeviceMgr* device_mgr, bool device_mgr_owned,
tensorflow::Rendezvous* rendezvous,
const tensorflow::CustomKernelCreator* custom_kernel_creator)
: context(new tensorflow::EagerContext(
opts,
static_cast<tensorflow::ContextDevicePlacementPolicy>(
default_device_placement_policy),
static_cast<tensorflow::ContextMirroringPolicy>(
default_mirroring_policy),
async, lazy_remote_inputs_copy, device_mgr, device_mgr_owned,
rendezvous, custom_kernel_creator)) {}
~TFE_Context() {
// TODO(iga): Add a separate API method to shutdown TFE_Context so that we
// don't send RPCs and block in destructor.
context->WaitForAndCloseRemoteContexts();
// context->RefCountIsOne() should be true here.
// TODO(iga): Remove EagerContext refcounting.
context->Unref();
}
tensorflow::EagerContext* context;
};
struct TFE_TensorHandle {
explicit TFE_TensorHandle(tensorflow::TensorHandle* h) : handle(h) {}
static TFE_TensorHandle* CreateLocalHandle(const class tensorflow::Tensor& t,
TF_Status* s) {
tensorflow::TensorHandle* handle;
@ -99,10 +74,10 @@ struct TFE_TensorHandle {
if (!s->status.ok()) {
return nullptr;
}
return new TFE_TensorHandle(handle);
return new TFE_TensorHandle{tensorflow::TensorHandleInterface(handle)};
}
tensorflow::TensorHandle* handle;
tensorflow::TensorHandleInterface handle;
};
struct TFE_TensorDebugInfo {
@ -125,28 +100,36 @@ struct TFE_OpInferenceContext {
struct TFE_Op {
TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
const tensorflow::AttrTypeMap* t,
TFE_OpInferenceContext* inference_ctx)
: operation(ctx->context, op, is_function, t),
inference_ctx(inference_ctx) {}
std::unique_ptr<TFE_OpInferenceContext> inference_ctx)
: ctx(ctx),
operation(ctx->context, op, is_function, t),
inference_ctx(std::move(inference_ctx)) {}
void Clear() {
operation.Clear();
inference_ctx.reset();
}
void Reset(TFE_Context* ctx, const char* op, bool is_function,
const tensorflow::AttrTypeMap* t,
TFE_OpInferenceContext* infer_ctx) {
operation.Reset(ctx->context, op, is_function, t, nullptr);
inference_ctx.reset(infer_ctx);
tensorflow::Status Reset(const char* op, bool is_function,
const tensorflow::AttrTypeMap* t,
const char* raw_device_name,
std::unique_ptr<TFE_OpInferenceContext> infer_ctx) {
inference_ctx = std::move(infer_ctx);
return operation.Reset(ctx->context, op, is_function, t, raw_device_name,
nullptr);
}
void AddInput(TFE_TensorHandle* input, TF_Status* status);
void Execute(TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status);
TFE_Context* ctx;
tensorflow::EagerOperation operation;
std::unique_ptr<TFE_OpInferenceContext> inference_ctx;
};
TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name,
TF_Status* status, TFE_Op* op_to_reset = nullptr);
const char* raw_device_name, TF_Status* status,
TFE_Op* op_to_reset = nullptr);
struct TFE_Profiler {
explicit TFE_Profiler() { profiler = tensorflow::ProfilerSession::Create(); }

View File

@ -0,0 +1,52 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_
#define TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
namespace tensorflow {
class TensorHandleInterface {
public:
explicit TensorHandleInterface(TensorHandle* h) : handle_(h) {}
~TensorHandleInterface();
bool IsValid(Status* status) const;
TF_DataType DataType() const;
int NumDims(Status* status) const;
int64_t NumElements(Status* status) const;
int64_t Dim(int dim_index, Status* status) const;
const char* DeviceName(Status* status) const;
const char* BackingDeviceName(Status* status) const;
TFE_TensorHandle* Copy();
TF_Tensor* Resolve(Status* status);
TFE_TensorDebugInfo* TensorDebugInfo(Status* status);
// TODO(gjn): This is not a very generic interface, but is needed for specific
// use cases.
TensorHandle* Handle() { return handle_; }
private:
TensorHandle* handle_;
};
} // namespace tensorflow
#endif // TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_

View File

@ -215,9 +215,24 @@ Status ModularFileSystem::DeleteFile(const std::string& fname) {
Status ModularFileSystem::DeleteRecursively(const std::string& dirname,
int64* undeleted_files,
int64* undeleted_dirs) {
// TODO(mihaimaruseac): Implementation to come in a new change
return Status(error::UNIMPLEMENTED,
"Modular filesystem stub not implemented yet");
if (undeleted_files == nullptr || undeleted_dirs == nullptr)
return errors::FailedPrecondition(
"DeleteRecursively must not be called with `undeleted_files` or "
"`undeleted_dirs` set to NULL");
if (ops_->delete_recursively == nullptr)
return FileSystem::DeleteRecursively(dirname, undeleted_files,
undeleted_dirs);
UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus);
std::string translated_name = TranslateName(dirname);
uint64_t plugin_undeleted_files, plugin_undeleted_dirs;
ops_->delete_recursively(filesystem_.get(), translated_name.c_str(),
&plugin_undeleted_files, &plugin_undeleted_dirs,
plugin_status.get());
*undeleted_files = plugin_undeleted_files;
*undeleted_dirs = plugin_undeleted_dirs;
return StatusFromTF_Status(plugin_status.get());
}
Status ModularFileSystem::DeleteDir(const std::string& dirname) {
@ -233,9 +248,14 @@ Status ModularFileSystem::DeleteDir(const std::string& dirname) {
}
Status ModularFileSystem::RecursivelyCreateDir(const std::string& dirname) {
// TODO(mihaimaruseac): Implementation to come in a new change
return Status(error::UNIMPLEMENTED,
"Modular filesystem stub not implemented yet");
if (ops_->recursively_create_dir == nullptr)
return FileSystem::RecursivelyCreateDir(dirname);
UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus);
std::string translated_name = TranslateName(dirname);
ops_->recursively_create_dir(filesystem_.get(), translated_name.c_str(),
plugin_status.get());
return StatusFromTF_Status(plugin_status.get());
}
Status ModularFileSystem::CreateDir(const std::string& dirname) {
@ -324,8 +344,8 @@ Status ModularFileSystem::CopyFile(const std::string& src,
if (ops_->copy_file == nullptr) return FileSystem::CopyFile(src, target);
UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus);
const std::string& translated_src = TranslateName(src);
const std::string& translated_target = TranslateName(target);
std::string translated_src = TranslateName(src);
std::string translated_target = TranslateName(target);
ops_->copy_file(filesystem_.get(), translated_src.c_str(),
translated_target.c_str(), plugin_status.get());
return StatusFromTF_Status(plugin_status.get());

View File

@ -1,35 +1,47 @@
# Experimental posix filesystem plugin.
load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
package(
default_visibility = ["//visibility:private"],
licenses = ["notice"], # Apache 2.0
)
# Although this target results in a shared object that will be loaded at
# runtime, this target must be a `cc_library` instead of a `cc_binary`. Making
# it a `cc_binary` requires `linkshared = True`. In turn, this brings in several
# TensorFlow symbols under the `tensorflow::` namespace, for which we have no
# ABI guarantees. Hence, in order to maintain ABI compatibility, this is marked
# as a `cc_library` for now and we will revisit in the future.
# TODO(mihaimaruseac): Determine if `cc_binary` makes more sense (when all
# filesystems are converted and BUILD files are refactored to be modular).
# TODO(b/144585140): The helpers should be separated into a different BUILD
# target, but doing that would result in symbols not being visible when loading
# the plugin. Revisit this once the POSIX filesystem completely lands. See also
# the other TODO. This also has the unfortunate effect that both versions of
# copy_file get compiled, regardless of which one actually gets used!
# Filesystem implementation for POSIX environments: Linux, MacOS, Android, etc.
tf_cc_shared_object(
name = "libposix_filesystem.so",
framework_so = [],
linkstatic = False,
visibility = ["//visibility:public"],
deps = [":posix_filesystem_impl"],
)
# The real implementation of the filesystem.
cc_library(
name = "posix_filesystem",
srcs = [
"posix_filesystem.cc",
"posix_filesystem_helper.cc",
"posix_filesystem_helper.h",
"copy_file.h",
] + select({
"//tensorflow:android": ["copy_file_portable.cc"],
"//conditions:default": ["copy_file_linux.cc"],
}),
name = "posix_filesystem_impl",
srcs = ["posix_filesystem.cc"],
deps = [
":posix_filesystem_helper",
"//tensorflow/c:tf_status",
"//tensorflow/c/experimental/filesystem:filesystem_interface",
],
)
# Library implementing helper functionality, so that the above only contains
# the API implementation for modular filesystems.
cc_library(
name = "posix_filesystem_helper",
srcs = ["posix_filesystem_helper.cc"],
hdrs = ["posix_filesystem_helper.h"],
deps = [":copy_file"],
)
# On Linux, we can copy files faster using `sendfile`. But not elsewhere.
# Hence, this private library to select which implementation to use.
cc_library(
name = "copy_file",
srcs = select({
"//tensorflow:linux_x86_64": ["copy_file_linux.cc"],
"//conditions:default": ["copy_file_portable.cc"],
}),
hdrs = ["copy_file.h"],
)
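For reference, a hedged sketch of the sendfile-based fast path that copy_file_linux.cc is expected to implement (the function name and signature here are illustrative, not the actual file contents):
#include <sys/sendfile.h>

// Copy `size` bytes from `src_fd` to `dst_fd` using sendfile(2), which avoids
// bouncing the data through user space. Returns 0 on success, -1 on error
// (the caller is assumed to convert errno into a Status).
int CopyFileContents(int dst_fd, int src_fd, off_t size) {
  off_t offset = 0;
  while (offset < size) {
    ssize_t copied = sendfile(dst_fd, src_fd, &offset, size - offset);
    if (copied <= 0) return -1;
  }
  return 0;
}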

View File

@ -181,7 +181,8 @@ void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor,
return;
}
const ::tensorflow::Tensor& cc_tensor(cc_ctx->input(i));
TF_Tensor* result = ::tensorflow::TF_TensorFromTensor(cc_tensor, status);
TF_Tensor* result =
::tensorflow::TF_TensorFromTensor(cc_tensor, &status->status);
if (TF_GetCode(status) == TF_OK) {
*tensor = result;
}

View File

@ -62,39 +62,6 @@ void deallocate_buffer(void* data, size_t len, void* arg) {
}
} // namespace tensorflow
namespace {
class TF_ManagedBuffer : public TensorBuffer {
public:
TF_ManagedBuffer(void* data, size_t len,
void (*deallocator)(void* data, size_t len, void* arg),
void* deallocator_arg)
: TensorBuffer(data),
len_(len),
deallocator_(deallocator),
deallocator_arg_(deallocator_arg) {}
const size_t len_;
void (*const deallocator_)(void* data, size_t len, void* arg);
void* const deallocator_arg_;
~TF_ManagedBuffer() override {
(*deallocator_)(data(), len_, deallocator_arg_);
}
size_t size() const override { return len_; }
TensorBuffer* root_buffer() override { return this; }
void FillAllocationDescription(
tensorflow::AllocationDescription* proto) const override {
tensorflow::int64 rb = size();
proto->set_requested_bytes(rb);
proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
}
// Prevents input forwarding from mutating this buffer.
bool OwnsMemory() const override { return false; }
};
} // namespace
TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims,
int num_dims, size_t len) {
@ -136,9 +103,9 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg);
}
TF_Tensor* ret =
new TF_Tensor{Tensor(static_cast<tensorflow::DataType>(dtype),
tensorflow::TensorShape(dimvec), buf)};
TF_Tensor* ret = new TF_Tensor{tensorflow::TensorInterface(
Tensor(static_cast<tensorflow::DataType>(dtype),
tensorflow::TensorShape(dimvec), buf))};
buf->Unref();
size_t elem_size = TF_DataTypeSize(dtype);
if (elem_size > 0 && len < (elem_size * ret->tensor.NumElements())) {
@ -148,37 +115,23 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
return ret;
}
TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) {
// It is safe to move the Tensor if and only if we own the unique reference to
// it. In that case, we might as well not delete and reallocate, but a future
// implementation might need to do so.
TensorBuffer* buf = tensorflow::TensorCApi::Buffer(tensor->tensor);
if (buf->RefCountIsOne() && buf->root_buffer()->RefCountIsOne() &&
buf->OwnsMemory()) {
return tensor;
}
return nullptr;
TF_Tensor* TF_TensorMaybeMove(TF_Tensor* t) {
return t->tensor.CanMove() ? t : nullptr;
}
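Callers can treat the return value as an in-place fast path and fall back to copying when it is null; a minimal sketch, with `t` assumed to be an existing TF_Tensor:
// Try to reuse `t`'s buffer in place; nullptr means the buffer is shared or
// not owned, so the caller must allocate a new tensor and copy the data.
TF_Tensor* moved = TF_TensorMaybeMove(t);
if (moved == nullptr) {
  // Fall back to copying into a freshly allocated tensor.
}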
void TF_DeleteTensor(TF_Tensor* t) { delete t; }
TF_DataType TF_TensorType(const TF_Tensor* t) {
return static_cast<TF_DataType>(t->tensor.dtype());
}
TF_DataType TF_TensorType(const TF_Tensor* t) { return t->tensor.Type(); }
int TF_NumDims(const TF_Tensor* t) { return t->tensor.dims(); }
int TF_NumDims(const TF_Tensor* t) { return t->tensor.NumDims(); }
int64_t TF_Dim(const TF_Tensor* t, int dim_index) {
return static_cast<int64_t>(t->tensor.dim_size(dim_index));
return t->tensor.Dim(dim_index);
}
size_t TF_TensorByteSize(const TF_Tensor* t) {
return tensorflow::TensorCApi::Buffer(t->tensor)->size();
}
size_t TF_TensorByteSize(const TF_Tensor* t) { return t->tensor.ByteSize(); }
void* TF_TensorData(const TF_Tensor* t) {
return tensorflow::TensorCApi::Buffer(t->tensor)->data();
}
void* TF_TensorData(const TF_Tensor* t) { return t->tensor.Data(); }
int64_t TF_TensorElementCount(const TF_Tensor* t) {
int64_t result = 1;
@ -193,16 +146,66 @@ void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type,
TF_Tensor* to, const int64_t* new_dims,
int num_new_dims, TF_Status* status) {
TF_SetStatus(status, TF_OK, "");
Status cc_status(
to->tensor.BitcastFrom(from->tensor, type, new_dims, num_new_dims));
Set_TF_Status_from_Status(status, cc_status);
}
namespace tensorflow {
bool TensorInterface::CanMove() const {
// It is safe to move the Tensor if and only if we own the unique reference to
// it. In that case, we might as well not delete and reallocate, but a future
// implementation might need to do so.
TensorBuffer* buf = tensorflow::TensorCApi::Buffer(tensor_);
if (buf->RefCountIsOne() && buf->root_buffer()->RefCountIsOne() &&
buf->OwnsMemory()) {
return true;
}
return false;
}
TF_DataType TensorInterface::Type() const {
return static_cast<TF_DataType>(tensor_.dtype());
}
int TensorInterface::NumDims() const { return tensor_.dims(); }
int64_t TensorInterface::Dim(int dim_index) const {
return static_cast<int64_t>(tensor_.dim_size(dim_index));
}
int64_t TensorInterface::NumElements() const {
return static_cast<int64_t>(tensor_.NumElements());
}
size_t TensorInterface::ByteSize() const {
return tensorflow::TensorCApi::Buffer(tensor_)->size();
}
void* TensorInterface::Data() const {
return tensorflow::TensorCApi::Buffer(tensor_)->data();
}
Status TensorInterface::BitcastFrom(const TensorInterface& from,
TF_DataType type, const int64_t* new_dims,
int num_new_dims) {
tensorflow::TensorShape s;
for (int i = 0; i < num_new_dims; ++i) {
s.AddDim(new_dims[i]);
}
Status cc_status(to->tensor.BitcastFrom(
from->tensor, static_cast<tensorflow::DataType>(type), s));
Set_TF_Status_from_Status(status, cc_status);
return tensor_.BitcastFrom(from.tensor_,
static_cast<tensorflow::DataType>(type), s);
}
} // namespace tensorflow
// --------------------------------------------------------------------------
void StringEncode(const char* src, size_t src_len, char* dst) {
dst = tensorflow::core::EncodeVarint64(dst, src_len);
memcpy(dst, src, src_len);
}
size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
size_t dst_len, TF_Status* status) {
const size_t sz = TF_StringEncodedSize(src_len);
@ -218,8 +221,7 @@ size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
src_len, "-byte string"));
return 0;
}
dst = tensorflow::core::EncodeVarint64(dst, src_len);
memcpy(dst, src, src_len);
StringEncode(src, src_len, dst);
return sz;
}
@ -278,13 +280,11 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype,
namespace tensorflow {
// Non-static for testing.
TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
TF_Status* status) {
TF_SetStatus(status, TF_OK, "");
TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, Status* status) {
*status = tensorflow::Status::OK();
if (!src.IsInitialized()) {
Set_TF_Status_from_Status(
status, FailedPrecondition(
"attempt to use a tensor with an uninitialized value"));
*status = FailedPrecondition(
"attempt to use a tensor with an uninitialized value");
return nullptr;
}
if (src.NumElements() == 0) {
@ -292,14 +292,13 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
}
if (src.dtype() == tensorflow::DT_RESOURCE) {
if (src.shape().dims() != 0) {
Set_TF_Status_from_Status(
status, InvalidArgument(
"Unexpected non-scalar DT_RESOURCE tensor seen (shape: ",
src.shape().DebugString(),
"). Please file a bug at "
"https://github.com/tensorflow/tensorflow/issues/new, "
"ideally with a "
"short code snippet that reproduces this error."));
*status = InvalidArgument(
"Unexpected non-scalar DT_RESOURCE tensor seen (shape: ",
src.shape().DebugString(),
"). Please file a bug at "
"https://github.com/tensorflow/tensorflow/issues/new, "
"ideally with a "
"short code snippet that reproduces this error.");
return nullptr;
}
const string str =
@ -338,23 +337,15 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
*offsets = (dst - data_start);
offsets++;
const string& s = srcarray(i);
size_t consumed = TF_StringEncode(s.data(), s.size(), dst, dst_len, status);
if (TF_GetCode(status) != TF_OK) {
Set_TF_Status_from_Status(
status,
InvalidArgument("invalid string tensor encoding (string #", i, " of ",
srcarray.size(), "): ", TF_Message(status)));
delete[] base;
return nullptr;
}
const size_t consumed = TF_StringEncodedSize(s.size());
StringEncode(s.data(), s.size(), dst);
dst += consumed;
dst_len -= consumed;
}
if (dst != base + size) {
Set_TF_Status_from_Status(
status, InvalidArgument(
"invalid string tensor encoding (decoded ", (dst - base),
" bytes, but the tensor is encoded in ", size, " bytes"));
*status = InvalidArgument(
"invalid string tensor encoding (decoded ", (dst - base),
" bytes, but the tensor is encoded in ", size, " bytes");
delete[] base;
return nullptr;
}
@ -372,31 +363,34 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
}
Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) {
if (src->tensor.dtype() == DT_RESOURCE) {
if (src->tensor.dims() != 0) {
return src->tensor.ToTensor(dst);
}
Status TensorInterface::ToTensor(Tensor* dst) const {
if (tensor_.dtype() == DT_RESOURCE) {
if (tensor_.dims() != 0) {
return InvalidArgument(
"Malformed TF_RESOURCE tensor: expected a scalar, got a tensor with "
"shape ",
src->tensor.shape().DebugString());
tensor_.shape().DebugString());
}
*dst = Tensor(tensorflow::DT_RESOURCE, src->tensor.shape());
*dst = Tensor(tensorflow::DT_RESOURCE, tensor_.shape());
if (!dst->scalar<tensorflow::ResourceHandle>()().ParseFromString(
string(static_cast<const char*>(TF_TensorData(src)),
TF_TensorByteSize(src)))) {
string(static_cast<const char*>(Data()), ByteSize()))) {
return InvalidArgument(
"Malformed TF_RESOUCE tensor: unable to parse resource handle");
}
return Status::OK();
}
if (src->tensor.dtype() != DT_STRING) {
*dst = src->tensor;
if (tensor_.dtype() != DT_STRING) {
*dst = tensor_;
return Status::OK();
}
// TF_STRING tensors require copying since Tensor class expects a sequence of
// string objects.
const tensorflow::int64 num_elements = src->tensor.NumElements();
const char* input = reinterpret_cast<const char*>(TF_TensorData(src));
const size_t src_size = TF_TensorByteSize(src);
const tensorflow::int64 num_elements = tensor_.NumElements();
const char* input = reinterpret_cast<const char*>(Data());
const size_t src_size = ByteSize();
if (static_cast<tensorflow::int64>(src_size / sizeof(tensorflow::uint64)) <
num_elements) {
return InvalidArgument(
@ -405,7 +399,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) {
const char* data_start = input + sizeof(tensorflow::uint64) * num_elements;
const char* limit = input + src_size;
*dst = Tensor(src->tensor.dtype(), src->tensor.shape());
*dst = Tensor(tensor_.dtype(), tensor_.shape());
auto dstarray = dst->flat<tstring>();
for (tensorflow::int64 i = 0; i < num_elements; ++i) {
tensorflow::uint64 offset =
@ -424,8 +418,12 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) {
return Status::OK();
}
bool TensorInterface::CopyFrom(const Tensor& other, const TensorShape& shape) {
return tensor_.CopyFrom(other, shape);
}
bool TensorInterface::IsAligned() const { return tensor_.IsAligned(); }
} // namespace tensorflow
bool TF_TensorIsAligned(const TF_Tensor* tensor) {
return tensor->tensor.IsAligned();
}
bool TF_TensorIsAligned(const TF_Tensor* t) { return t->tensor.IsAligned(); }

View File

@ -17,7 +17,9 @@ limitations under the License.
#define TENSORFLOW_C_TF_TENSOR_INTERNAL_H_
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_interface.h"
#include "tensorflow/core/framework/tensor_shape.h"
// Internal structures used by the C API. These are likely to change and should
@ -27,9 +29,41 @@ limitations under the License.
// passed to or returned from C functions *by pointer*. Otherwise, changes to
// its internal structure will break the C API's binary interface.
typedef struct TF_Tensor {
::tensorflow::Tensor tensor;
tensorflow::TensorInterface tensor;
} TF_Tensor;
class TF_ManagedBuffer : public tensorflow::TensorBuffer {
public:
TF_ManagedBuffer(void* data, size_t len,
void (*deallocator)(void* data, size_t len, void* arg),
void* deallocator_arg)
: TensorBuffer(data),
len_(len),
deallocator_(deallocator),
deallocator_arg_(deallocator_arg) {}
~TF_ManagedBuffer() override {
(*deallocator_)(data(), len_, deallocator_arg_);
}
size_t size() const override { return len_; }
TensorBuffer* root_buffer() override { return this; }
void FillAllocationDescription(
tensorflow::AllocationDescription* proto) const override {
tensorflow::int64 rb = size();
proto->set_requested_bytes(rb);
proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
}
// Prevents input forwarding from mutating this buffer.
bool OwnsMemory() const override { return false; }
private:
const size_t len_;
void (*const deallocator_)(void* data, size_t len, void* arg);
void* const deallocator_arg_;
};
namespace tensorflow {
class TensorCApi {
@ -50,4 +84,5 @@ void* allocate_tensor(const char* operation, size_t len, Allocator* allocator);
// a different Allocator as `arg`.
void deallocate_buffer(void* data, size_t len, void* arg);
} // namespace tensorflow
#endif // TENSORFLOW_C_TF_TENSOR_INTERNAL_H_

View File

@ -233,6 +233,7 @@ cc_library_with_android_deps(
deps = [
"//tensorflow/core:core_cpu",
"//tensorflow/core:lib",
"//tensorflow/core:lib_experimental",
"//tensorflow/core:protos_all_cc",
],
)

View File

@ -127,6 +127,33 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs,
target_node_names, outputs, run_metadata);
}
Status ClientSession::Run(
const RunOptions& run_options, const FeedType& inputs,
const std::vector<Output>& fetch_outputs,
const std::vector<Operation>& run_outputs, std::vector<Tensor>* outputs,
RunMetadata* run_metadata,
const thread::ThreadPoolOptions& threadpool_options) const {
std::vector<std::pair<string, Tensor>> feeds;
for (auto const& feed : inputs) {
TF_RETURN_IF_ERROR(feed.second.status);
feeds.emplace_back(feed.first.name(), feed.second.tensor);
}
std::vector<string> output_tensor_names;
output_tensor_names.reserve(fetch_outputs.size());
for (auto const& output : fetch_outputs) {
output_tensor_names.push_back(output.name());
}
std::vector<string> target_node_names;
target_node_names.reserve(run_outputs.size());
for (auto const& output : run_outputs) {
target_node_names.push_back(output.node()->name());
}
TF_RETURN_IF_ERROR(impl()->MaybeExtendGraph());
return impl()->session_->Run(run_options, feeds, output_tensor_names,
target_node_names, outputs, run_metadata,
threadpool_options);
}
Status ClientSession::MakeCallable(const CallableOptions& callable_options,
CallableHandle* out_handle) {
TF_RETURN_IF_ERROR(impl()->MaybeExtendGraph());

View File

@ -93,6 +93,14 @@ class ClientSession {
const std::vector<Operation>& run_outputs,
std::vector<Tensor>* outputs, RunMetadata* run_metadata) const;
/// Same as above. Additionally allows user to provide custom threadpool
/// implementation via ThreadPoolOptions.
Status Run(const RunOptions& run_options, const FeedType& inputs,
const std::vector<Output>& fetch_outputs,
const std::vector<Operation>& run_outputs,
std::vector<Tensor>* outputs, RunMetadata* run_metadata,
const thread::ThreadPoolOptions& threadpool_options) const;
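A hedged usage sketch of this overload; `session` and `fetch` are assumed to exist, and the two pools are hypothetical caller-owned objects implementing tensorflow::thread::ThreadPoolInterface:
// Run with caller-provided inter-op and intra-op thread pools instead of the
// session's default pools.
tensorflow::thread::ThreadPoolOptions pool_options;
pool_options.inter_op_threadpool = my_inter_op_pool;
pool_options.intra_op_threadpool = my_intra_op_pool;
std::vector<tensorflow::Tensor> outputs;
TF_CHECK_OK(session.Run(tensorflow::RunOptions(),
                        tensorflow::ClientSession::FeedType{}, {fetch},
                        /*run_outputs=*/{}, &outputs,
                        /*run_metadata=*/nullptr, pool_options));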
/// \brief A handle to a subgraph, created with
/// `ClientSession::MakeCallable()`.
typedef int64 CallableHandle;

View File

@ -112,7 +112,7 @@ TEST(ClientSessionTest, Extend) {
test::ExpectTensorEqual<int>(outputs[0], test::AsTensor<int>({31, 42}, {2}));
}
TEST(ClientSessionTest, MultiThreaded) {
TEST(ClientSessionTest, MultiThreadedWithDefaultThreadpool) {
Scope root = Scope::NewRootScope();
auto a = Add(root, {1, 2}, {3, 4});
auto b = Mul(root, {1, 2}, {3, 4});
@ -138,6 +138,49 @@ TEST(ClientSessionTest, MultiThreaded) {
test::ExpectTensorEqual<int>(outputs[0], test::AsTensor<int>({-1, 2}, {2}));
}
TEST(ClientSessionTest, MultiThreadedWithCustomThreadpool) {
Scope root = Scope::NewRootScope();
int num_threads = 3;
auto a = Add(root, {1, 2}, {3, 4});
auto b = Mul(root, {1, 2}, {3, 4});
ClientSession session(root);
auto inter_op_threadpool =
absl::make_unique<CustomThreadPoolImpl>(num_threads);
ASSERT_EQ(inter_op_threadpool->GetNumScheduleCalled(), 0);
auto intra_op_threadpool =
absl::make_unique<CustomThreadPoolImpl>(num_threads);
ASSERT_EQ(intra_op_threadpool->GetNumScheduleCalled(), 0);
tensorflow::thread::ThreadPoolOptions threadPoolOptions;
threadPoolOptions.inter_op_threadpool = inter_op_threadpool.get();
threadPoolOptions.intra_op_threadpool = intra_op_threadpool.get();
{
thread::ThreadPool thread_pool(Env::Default(), "pool", 2);
thread_pool.Schedule([&session, a]() {
std::vector<Tensor> outputs;
TF_EXPECT_OK(session.Run(RunOptions(), ClientSession::FeedType{}, {a}, {},
&outputs, nullptr, thread::ThreadPoolOptions()));
test::ExpectTensorEqual<int>(outputs[0],
test::AsTensor<int>({4, 6}, {2}));
});
thread_pool.Schedule([&session, b]() {
std::vector<Tensor> outputs;
TF_EXPECT_OK(session.Run(RunOptions(), ClientSession::FeedType{}, {b}, {},
&outputs, nullptr, thread::ThreadPoolOptions()));
test::ExpectTensorEqual<int>(outputs[0],
test::AsTensor<int>({3, 8}, {2}));
});
}
auto c = Sub(root, b, a);
std::vector<Tensor> outputs;
TF_EXPECT_OK(session.Run(RunOptions(), ClientSession::FeedType{}, {c}, {},
&outputs, nullptr, thread::ThreadPoolOptions()));
test::ExpectTensorEqual<int>(outputs[0], test::AsTensor<int>({-1, 2}, {2}));
}
TEST(ClientSessionTest, CallableWithDefaultThreadPool) {
Scope root = Scope::NewRootScope();
auto a = Placeholder(root, DT_INT32);

View File

@ -329,7 +329,7 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
new std::unordered_map<StringPiece, std::pair<const char*, bool>,
StringPieceHasher>{
{"string", {"StringPiece", false}},
{"list(string)", {"gtl::ArraySlice<string>", true}},
{"list(string)", {"gtl::ArraySlice<::tensorflow::tstring>", true}},
{"int", {"int64", false}},
{"list(int)", {"gtl::ArraySlice<int>", true}},
{"float", {"float", false}},

View File

@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define _USE_MATH_DEFINES
#include <cmath>
#include "tensorflow/cc/ops/array_ops_internal.h"

View File

@ -259,6 +259,9 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
RunTest(x, x_init_value, y, y_shape);
}
// TODO(rocm):
// Re-enable this test once 3D pooling is supported on ROCm platform
#ifndef TENSORFLOW_USE_ROCM
TEST_F(NNGradTest, MaxPool3DGradHelper) {
TensorShape x_shape({1, 3, 3, 3, 1});
TensorShape y_shape({1, 1, 1, 1, 1});
@ -271,6 +274,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) {
SetRandomValuesForMaxPooling<float>(&x_init_value);
RunTest(x, x_init_value, y, y_shape);
}
#endif
TEST_F(NNGradTest, AvgPoolGradHelper) {
TensorShape x_shape({1, 2, 2, 1});
@ -283,6 +287,9 @@ TEST_F(NNGradTest, AvgPoolGradHelper) {
RunTest(x, x_shape, y, y_shape);
}
// TODO(rocm):
// Re-enable this test once 3D pooling is supported on ROCm platform
#ifndef TENSORFLOW_USE_ROCM
TEST_F(NNGradTest, AvgPool3DGradHelper) {
TensorShape x_shape({1, 3, 3, 3, 1});
TensorShape y_shape({1, 1, 1, 1, 1});
@ -293,6 +300,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) {
auto y = AvgPool3D(scope_, x, ksize, strides, "SAME");
RunTest(x, x_shape, y, y_shape);
}
#endif
TEST_F(NNGradTest, LRN) {
TensorShape x_shape({1, 1, 2, 1});

View File

@ -9,8 +9,8 @@ tf_cuda_cc_test(
name = "profiler_test",
srcs = ["profiler_test.cc"],
tags = [
"no_gpu", # b/77649654
"no_rocm", # stream level tracing not supported on ROCm
"nogpu", # b/77649654
],
deps = [
":profiler",

View File

@ -10,7 +10,7 @@ load(
"tf_cc_test",
)
load(
"//tensorflow/core/platform:default/build_config_root.bzl",
"//tensorflow/core/platform:build_config_root.bzl",
"if_static",
"if_static_and_not_mobile",
)
@ -118,6 +118,37 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "bundle_v2",
srcs = ["bundle_v2.cc"],
hdrs = ["bundle_v2.h"],
deps = [
":constants",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/platform:strcat",
"//tensorflow/core/util/tensor_bundle",
"@com_google_absl//absl/container:flat_hash_set",
],
)
tf_cc_test(
name = "bundle_v2_test",
srcs = ["bundle_v2_test.cc"],
data = [
":saved_model_half_plus_two",
],
linkstatic = 1,
deps = [
":bundle_v2",
"//tensorflow/core:lib",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/platform:test",
],
)
tf_cc_test(
name = "saved_model_bundle_test",
srcs = ["saved_model_bundle_test.cc"],
@ -160,6 +191,26 @@ tf_cc_test(
],
)
# A subset of the TF2 saved models can be generated with this tool.
py_binary(
name = "testdata/generate_saved_models",
srcs = ["testdata/generate_saved_models.py"],
python_version = "PY3",
srcs_version = "PY3",
deps = [
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:tensor_spec",
"//tensorflow/python:variables",
"//tensorflow/python/compat:v2_compat",
"//tensorflow/python/eager:def_function",
"//tensorflow/python/module",
"//tensorflow/python/saved_model",
"//tensorflow/python/saved_model:save_options",
"@absl_py//absl:app",
],
)
# TODO(b/32673259): add a test to continuously validate these files.
filegroup(
name = "saved_model_half_plus_two",
@ -169,5 +220,7 @@ filegroup(
"testdata/half_plus_two/**",
"testdata/half_plus_two_v2/**",
"testdata/x_plus_y_v2_debuginfo/**",
"testdata/CyclicModule/**",
"testdata/VarsAndArithmeticObjectGraph/**",
]),
)

View File

@ -0,0 +1,223 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/cc/saved_model/bundle_v2.h"
#include "tensorflow/cc/saved_model/constants.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/strcat.h"
#include "tensorflow/core/protobuf/saved_model.pb.h"
#include "tensorflow/core/protobuf/trackable_object_graph.pb.h"
namespace tensorflow {
namespace {
Status ReadSavedModelProto(const string& export_dir,
SavedModel* saved_model_proto) {
LOG(INFO) << "Reading SavedModel from: " << export_dir;
const string saved_model_pb_path =
io::JoinPath(export_dir, kSavedModelFilenamePb);
if (Env::Default()->FileExists(saved_model_pb_path).ok()) {
return ReadBinaryProto(Env::Default(), saved_model_pb_path,
saved_model_proto);
}
const string saved_model_pbtxt_path =
io::JoinPath(export_dir, kSavedModelFilenamePbTxt);
if (Env::Default()->FileExists(saved_model_pbtxt_path).ok()) {
return ReadTextProto(Env::Default(), saved_model_pbtxt_path,
saved_model_proto);
}
return Status(error::Code::NOT_FOUND,
"Could not find SavedModel .pb or .pbtxt at supplied export "
"directory path: " +
export_dir);
}
Status ReadSavedModelDebugInfoIfPresent(
const string& export_dir,
std::unique_ptr<GraphDebugInfo>* debug_info_proto) {
LOG(INFO) << "Reading SavedModel debug info (if present) from: "
<< export_dir;
const string debug_info_pb_path =
io::JoinPath(export_dir, "debug", "saved_model_debug_info.pb");
if (Env::Default()->FileExists(debug_info_pb_path).ok()) {
GraphDebugInfo debug_info;
TF_RETURN_IF_ERROR(
ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info));
*debug_info_proto =
absl::make_unique<GraphDebugInfo>(std::move(debug_info));
}
return Status::OK();
}
Status ReadCheckpointObjectGraph(BundleReader* bundle_reader,
TrackableObjectGraph* object_graph) {
Tensor object_graph_tensor;
TF_RETURN_WITH_CONTEXT_IF_ERROR(
bundle_reader->Lookup(kObjectGraphProtoKey, &object_graph_tensor),
"SavedModel checkpoint does not contain object graph.");
if (object_graph_tensor.dtype() != DT_STRING ||
object_graph_tensor.dims() != 0 ||
object_graph_tensor.NumElements() != 1) {
return Status(
error::Code::FAILED_PRECONDITION,
"SavedModel checkpoint object graph was not the correct type.");
}
const tstring* object_graph_string = reinterpret_cast<const tstring*>(
object_graph_tensor.tensor_data().data());
if (!object_graph->ParseFromString(*object_graph_string)) {
return Status(
error::Code::FAILED_PRECONDITION,
"SavedModel checkpoint object graph could not be deserialized.");
}
return Status::OK();
}
} // namespace
Status SavedModelV2Bundle::Load(const std::string& export_dir,
SavedModelV2Bundle* const bundle) {
SavedModel saved_model_proto;
TF_RETURN_IF_ERROR(ReadSavedModelProto(export_dir, &saved_model_proto));
// Load MetaGraphDef.
// In version 2 SavedModels, there is only one MetaGraphDef.
if (saved_model_proto.meta_graphs_size() != 1) {
return Status(
error::Code::INVALID_ARGUMENT,
strings::StrCat(
"SavedModelV2 should have exactly one MetaGraphDef but actually ",
"contains ", saved_model_proto.meta_graphs_size()));
}
bundle->meta_graph_def_ =
std::move(*saved_model_proto.mutable_meta_graphs(0));
// Load GraphDebugInfo.
TF_RETURN_IF_ERROR(
ReadSavedModelDebugInfoIfPresent(export_dir, &bundle->debug_info_));
// Load the variables checkpoint reader.
const std::string variables_prefix = io::JoinPath(
export_dir, kSavedModelVariablesDirectory, kSavedModelVariablesFilename);
bundle->variable_reader_.reset(
new BundleReader(Env::Default(), variables_prefix));
TF_RETURN_WITH_CONTEXT_IF_ERROR(
bundle->variable_reader_->status(),
"Unable to load SavedModel variables checkpoint from ", variables_prefix);
// Deserialize the object graph proto from the tensor bundle.
TF_RETURN_IF_ERROR(ReadCheckpointObjectGraph(
bundle->variable_reader_.get(), &bundle->trackable_object_graph_));
return Status::OK();
}
Status SavedModelV2Bundle::VisitObjectsToRestore(
RestoreObjectsCallback callback) {
if (saved_object_graph().nodes_size() == 0 ||
trackable_object_graph().nodes_size() == 0) {
return Status::OK();
}
// Start from root nodes of both the SavedObjectGraph and TrackableObjectGraph
// and descend to leaves. Note that the TrackableObjectGraph can have cycles
// (as can the SavedObjectGraph).
// This is detected and cycle edges are skipped.
const SavedObject* root_saved_object = &saved_object_graph().nodes(0);
const TrackableObjectGraph::TrackableObject* root_trackable_object =
&trackable_object_graph().nodes(0);
absl::flat_hash_set<int> trackable_node_ids;
return RecurseObjectsToRestore(root_saved_object, 0, root_trackable_object,
std::string(), &trackable_node_ids,
std::move(callback));
}
Status SavedModelV2Bundle::RecurseObjectsToRestore(
const SavedObject* saved_object, int saved_object_node_id,
const TrackableObjectGraph::TrackableObject* trackable_object,
std::string object_name, absl::flat_hash_set<int>* seen_trackable_node_ids,
RestoreObjectsCallback callback) {
// Invoke the callback if the object has any attributes or slot variables.
// Note that the root is always excluded from the search (it can never
// be a restorable object). This matches some logic on the Python side.
if (saved_object_node_id != 0 &&
(trackable_object->attributes_size() > 0 ||
trackable_object->slot_variables_size() > 0)) {
TF_RETURN_WITH_CONTEXT_IF_ERROR(
callback(saved_object_node_id, *trackable_object), "Unable to restore ",
object_name);
}
for (const auto& trackable_child_ref : trackable_object->children()) {
const auto& local_name = trackable_child_ref.local_name();
// Compute the full child name.
std::string child_name;
if (object_name.empty()) {
child_name = local_name;
} else {
child_name = strings::StrCat(object_name, ".", local_name);
}
// Descend down the trackable graph.
int trackable_child_node_id = trackable_child_ref.node_id();
if (!seen_trackable_node_ids->insert(trackable_child_node_id).second) {
// Cycle or duplicate detected - ignore this branch.
continue;
}
if (trackable_child_node_id < 0 ||
trackable_child_node_id >= trackable_object_graph().nodes_size()) {
return Status(
errors::Code::FAILED_PRECONDITION,
strings::StrCat("Illegal trackable child node id for ", child_name));
}
const auto* trackable_child =
&trackable_object_graph().nodes(trackable_child_node_id);
// Descend down the saved object graph.
int saved_child_node_id = -1;
const SavedObject* saved_child = nullptr;
for (const auto& saved_child_ref : saved_object->children()) {
if (saved_child_ref.local_name() == local_name) {
// Found.
saved_child_node_id = saved_child_ref.node_id();
if (saved_child_node_id >= 0 &&
saved_child_node_id < saved_object_graph().nodes_size()) {
saved_child = &saved_object_graph().nodes(saved_child_node_id);
}
break;
}
}
if (!saved_child) {
return Status(
errors::Code::FAILED_PRECONDITION,
strings::StrCat("Could not find saved object to restore for ",
child_name));
}
TF_RETURN_IF_ERROR(RecurseObjectsToRestore(
saved_child, saved_child_node_id, trackable_child, child_name,
seen_trackable_node_ids, callback));
}
return Status::OK();
}
} // namespace tensorflow

View File

@ -0,0 +1,87 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Helpers for loading the persistent representation of a SavedModelV2.
// Note that this header is depended on by code that does not make use of
// the full runtime, so its dependencies should be kept minimal.
#ifndef TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_
#define TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_
#include <string>
#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/protobuf/graph_debug_info.pb.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"
#include "tensorflow/core/protobuf/saved_object_graph.pb.h"
#include "tensorflow/core/protobuf/trackable_object_graph.pb.h"
#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
namespace tensorflow {
/// Represents a version 2 SavedModel that is loaded from storage (but not yet
/// loaded into an executable in-memory representation).
class SavedModelV2Bundle {
public:
using RestoreObjectsCallback =
std::function<Status(int, const TrackableObjectGraph::TrackableObject&)>;
/// Loads persistent representations for a SavedModelV2 from the specified
/// export directory.
static Status Load(const std::string& export_dir, SavedModelV2Bundle* bundle);
/// MetaGraphDef from the loaded SavedModel.
MetaGraphDef& meta_graph_def() { return meta_graph_def_; }
/// SavedObjectGraph from the MetaGraphDef.
const SavedObjectGraph& saved_object_graph() {
return meta_graph_def().object_graph_def();
}
/// TrackableObjectGraph loaded from the variable_reader() checkpoint.
TrackableObjectGraph& trackable_object_graph() {
return trackable_object_graph_;
}
/// BundleReader for accessing the variables bundle.
BundleReader* variable_reader() { return variable_reader_.get(); }
/// The GraphDebugInfo (or nullptr if none).
GraphDebugInfo* debug_info() { return debug_info_.get(); }
/// Restores objects, invoking the callback with the node id in the
/// saved_object_graph() and the corresponding TrackableObject from the
/// trackable_object_graph(). The callback may use the variable_reader() but
/// must not modify the underlying saved_object_graph().
Status VisitObjectsToRestore(RestoreObjectsCallback callback);
private:
Status RecurseObjectsToRestore(
const SavedObject* saved_object, int saved_object_node_id,
const TrackableObjectGraph::TrackableObject* trackable_object,
std::string object_name,
absl::flat_hash_set<int>* seen_trackable_node_ids,
RestoreObjectsCallback callback);
MetaGraphDef meta_graph_def_;
TrackableObjectGraph trackable_object_graph_;
std::unique_ptr<BundleReader> variable_reader_;
std::unique_ptr<GraphDebugInfo> debug_info_;
};
} // namespace tensorflow
#endif // TENSORFLOW_CC_SAVED_MODEL_BUNDLE_V2_H_
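For orientation, here is a minimal sketch of how this API can be driven, mirroring the test below; the function name RestoreAllVariables and the restore step inside the loop are illustrative placeholders, not part of this change:
#include "tensorflow/cc/saved_model/bundle_v2.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
tensorflow::Status RestoreAllVariables(const std::string& export_dir) {
  tensorflow::SavedModelV2Bundle bundle;
  TF_RETURN_IF_ERROR(tensorflow::SavedModelV2Bundle::Load(export_dir, &bundle));
  return bundle.VisitObjectsToRestore(
      [&bundle](int saved_node_id,
                const tensorflow::TrackableObjectGraph::TrackableObject& obj)
          -> tensorflow::Status {
        for (const auto& attr : obj.attributes()) {
          // Each attribute's checkpoint key indexes into the variables bundle.
          tensorflow::Tensor value;
          TF_RETURN_IF_ERROR(
              bundle.variable_reader()->Lookup(attr.checkpoint_key(), &value));
          // ... hand `value` to whatever in-memory representation is being
          // built for saved_node_id ...
        }
        return tensorflow::Status::OK();
      });
}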

View File

@ -0,0 +1,99 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/cc/saved_model/bundle_v2.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/test.h"
namespace tensorflow {
namespace {
constexpr char kTestData[] = "cc/saved_model/testdata";
class BundleV2Test : public ::testing::Test {
protected:
BundleV2Test() {}
void RestoreVarsAndVerify(SavedModelV2Bundle* bundle,
std::vector<std::string> expected_names) {
// Collect saved_node_id, full_name, checkpoint_key into a vector.
using RestoredVarType = std::tuple<int, std::string, std::string>;
std::vector<RestoredVarType> restored_vars;
TF_ASSERT_OK(bundle->VisitObjectsToRestore(
[&](int saved_node_id,
const TrackableObjectGraph::TrackableObject& trackable_object)
-> Status {
for (const auto& attr : trackable_object.attributes()) {
if (attr.name() == "VARIABLE_VALUE") {
restored_vars.emplace_back(saved_node_id, attr.full_name(),
attr.checkpoint_key());
}
}
return Status::OK();
}));
// Should be one of each var name restored.
for (const auto& expected_name : expected_names) {
EXPECT_EQ(1, std::count_if(restored_vars.begin(), restored_vars.end(),
[&](RestoredVarType t) {
return std::get<1>(t) == expected_name;
}));
}
for (const auto& restored_var : restored_vars) {
// Each restored var should match a SavedObjectGraph node with the same
// variable name.
const auto& saved_node =
bundle->saved_object_graph().nodes(std::get<0>(restored_var));
EXPECT_EQ(std::get<1>(restored_var), saved_node.variable().name());
// And should be able to load it from the tensor_bundle.
Tensor value;
TF_ASSERT_OK(
bundle->variable_reader()->Lookup(std::get<2>(restored_var), &value));
}
}
};
TEST_F(BundleV2Test, LoadsVarsAndArithmeticObjectGraph) {
const string export_dir = io::JoinPath(
testing::TensorFlowSrcRoot(), kTestData, "VarsAndArithmeticObjectGraph");
SavedModelV2Bundle bundle;
TF_ASSERT_OK(SavedModelV2Bundle::Load(export_dir, &bundle));
// Ensure that there are nodes in the trackable_object_graph.
EXPECT_GT(bundle.trackable_object_graph().nodes_size(), 0);
RestoreVarsAndVerify(&bundle, {"variable_x", "variable_y", "child_variable"});
}
TEST_F(BundleV2Test, LoadsCyclicModule) {
const string export_dir =
io::JoinPath(testing::TensorFlowSrcRoot(), kTestData, "CyclicModule");
SavedModelV2Bundle bundle;
TF_ASSERT_OK(SavedModelV2Bundle::Load(export_dir, &bundle));
// Ensure that there are nodes in the trackable_object_graph.
EXPECT_GT(bundle.trackable_object_graph().nodes_size(), 0);
RestoreVarsAndVerify(&bundle, {"MyVariable"});
}
} // namespace
} // namespace tensorflow

View File

@ -50,6 +50,9 @@ constexpr char kSavedModelVariablesFilename[] = "variables";
constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op";
constexpr char kSavedModelTrainOpSignatureKey[] = "__saved_model_train_op";
// Key in the TensorBundle for the object graph proto.
constexpr char kObjectGraphProtoKey[] = "_CHECKPOINTABLE_OBJECT_GRAPH";
} // namespace tensorflow
#endif // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_

View File

@ -1,7 +1,7 @@
# Description:
# CLIF wrappers for TensorFlow SavedModels.
load("//tensorflow/core/platform:default/build_config.bzl", "tf_py_clif_cc")
load("//tensorflow/core/platform:build_config.bzl", "tf_py_clif_cc")
package(
default_visibility = ["//visibility:public"],

View File

@ -0,0 +1,235 @@
(Binary file contents not shown: appears to be a serialized GraphDebugInfo / saved_model_debug_info.pb for the generated CyclicModule test SavedModel.)

Binary file not shown.

View File

@ -0,0 +1,506 @@
(Binary file contents not shown: appears to be a serialized GraphDebugInfo / saved_model_debug_info.pb for the generated VarsAndArithmeticObjectGraph test SavedModel.)

View File

@ -0,0 +1,97 @@
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Standalone utility to generate some test saved models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from tensorflow.python.compat import v2_compat
from tensorflow.python.eager import def_function
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_spec
from tensorflow.python.module import module
from tensorflow.python.ops import variables
from tensorflow.python.saved_model import save_options
from tensorflow.python.saved_model import saved_model
class VarsAndArithmeticObjectGraph(module.Module):
"""Three vars (one in a sub-module) and compute method."""
def __init__(self):
self.x = variables.Variable(1.0, name="variable_x")
self.y = variables.Variable(2.0, name="variable_y")
self.child = module.Module()
self.child.z = variables.Variable(3.0, name="child_variable")
self.child.c = ops.convert_to_tensor(5.0)
@def_function.function(input_signature=[
tensor_spec.TensorSpec((), dtypes.float32),
tensor_spec.TensorSpec((), dtypes.float32)
])
def compute(self, a, b):
return (a + self.x) * (b + self.y) / (self.child.z) + self.child.c
class ReferencesParent(module.Module):
def __init__(self, parent):
super(ReferencesParent, self).__init__()
self.parent = parent
self.my_variable = variables.Variable(3., name="MyVariable")
# Creates a cyclic object graph.
class CyclicModule(module.Module):
def __init__(self):
super(CyclicModule, self).__init__()
self.child = ReferencesParent(self)
MODULE_CTORS = {
"VarsAndArithmeticObjectGraph": VarsAndArithmeticObjectGraph,
"CyclicModule": CyclicModule,
}
def main(args):
if len(args) != 3:
print("Expected: {export_path} {ModuleName}")
print("Allowed ModuleNames:", MODULE_CTORS.keys())
return 1
_, export_path, module_name = args
module_ctor = MODULE_CTORS.get(module_name)
if not module_ctor:
print("Expected ModuleName to be one of:", MODULE_CTORS.keys())
return 2
os.makedirs(export_path)
tf_module = module_ctor()
options = save_options.SaveOptions(save_debug_info=True)
saved_model.save(tf_module, export_path, options=options)
if __name__ == "__main__":
v2_compat.enable_v2_behavior()
app.run(main)

View File

@ -75,8 +75,8 @@ tf_cc_test(
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"@com_google_absl//absl/strings",
"@llvm//:support", # fixdeps: keep
"@llvm//:x86_code_gen", # fixdeps: keep
"@llvm-project//llvm:support", # fixdeps: keep
"@llvm-project//llvm:x86_code_gen", # fixdeps: keep
],
)
@ -104,11 +104,11 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"@com_google_absl//absl/strings",
"@llvm//:aarch64_code_gen", # fixdeps: keep
"@llvm//:arm_code_gen", # fixdeps: keep
"@llvm//:powerpc_code_gen", # fixdeps: keep
"@llvm//:target",
"@llvm//:x86_code_gen", # fixdeps: keep
"@llvm-project//llvm:aarch64_code_gen", # fixdeps: keep
"@llvm-project//llvm:arm_code_gen", # fixdeps: keep
"@llvm-project//llvm:powerpc_code_gen", # fixdeps: keep
"@llvm-project//llvm:target",
"@llvm-project//llvm:x86_code_gen", # fixdeps: keep
],
)
@ -205,9 +205,9 @@ cc_library(
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
"@llvm//:core",
"@llvm//:support",
"@llvm//:target",
"@llvm-project//llvm:core",
"@llvm-project//llvm:support",
"@llvm-project//llvm:target",
],
)

View File

@ -323,6 +323,42 @@ tf_library(
],
)
tf_library(
name = "test_graph_tfcond_mlir_bridge",
testonly = 1,
config = "test_graph_tfcond.config.pbtxt",
cpp_class = "CondComp",
graph = "test_graph_tfcond.pb",
mlir_components = "Bridge",
tags = [
"manual",
],
)
tf_library(
name = "test_graph_tfassert_eq_mlir_bridge",
testonly = 1,
config = "test_graph_tfassert_eq.config.pbtxt",
cpp_class = "AssertComp",
graph = "test_graph_tfassert_eq.pb",
mlir_components = "Bridge",
tags = [
"manual",
],
)
tf_library(
name = "test_graph_tfgather_mlir_bridge",
testonly = 1,
config = "test_graph_tfgather.config.pbtxt",
cpp_class = "GatherComp",
graph = "test_graph_tfgather.pb",
mlir_components = "Bridge",
tags = [
"manual",
],
)
tf_library(
name = "test_graph_tfmatmul_mlir_bridge",
testonly = 1,
@ -361,6 +397,30 @@ tf_library(
],
)
tf_library(
name = "test_graph_tfsplits_mlir_bridge",
testonly = 1,
config = "test_graph_tfsplits.config.pbtxt",
cpp_class = "SplitsComp",
graph = "test_graph_tfsplits.pb",
mlir_components = "Bridge",
tags = [
"manual",
],
)
tf_library(
name = "test_graph_tftop_k_mlir_bridge",
testonly = 1,
config = "test_graph_tftop_k.config.pbtxt",
cpp_class = "TopKComp",
graph = "test_graph_tftop_k.pb",
mlir_components = "Bridge",
tags = [
"manual",
],
)
tf_cc_test(
name = "tfcompile_test_mlir_bridge",
srcs = ["tfcompile_test.cc"],
@ -372,9 +432,14 @@ tf_cc_test(
":test_graph_tfadd_mlir_bridge",
":test_graph_tfadd_with_ckpt_mlir_bridge",
":test_graph_tfadd_with_ckpt_saver_mlir_bridge",
":test_graph_tfassert_eq_mlir_bridge",
":test_graph_tfcond_mlir_bridge",
":test_graph_tfgather_mlir_bridge",
":test_graph_tfmatmul_mlir_bridge",
":test_graph_tfmatmulandadd_mlir_bridge",
":test_graph_tfmatmulandadd_with_profiling_mlir_bridge",
":test_graph_tfsplits_mlir_bridge",
":test_graph_tftop_k_mlir_bridge",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:xla_data_proto_cc",

View File

@ -34,6 +34,7 @@ from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import control_flow_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variables
@ -184,6 +185,7 @@ def write_graph(build_graph, out_dir):
def main(_):
control_flow_util.enable_control_flow_v2()
write_graph(tfadd, FLAGS.out_dir)
write_graph(tfadd_with_ckpt, FLAGS.out_dir)
write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir)

View File

@ -30,9 +30,14 @@ limitations under the License.
#include "tensorflow/compiler/aot/tests/test_graph_tfadd_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfcond_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfgather_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfmatmul_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfsplits_mlir_bridge.h"
#include "tensorflow/compiler/aot/tests/test_graph_tftop_k_mlir_bridge.h"
#else
#include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
#include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
@ -167,8 +172,6 @@ TEST(TFCompileTest, AddWithCkptSaver) {
EXPECT_EQ(add_const.result0_data(), add_const.results()[0]);
}
// TODO(bixia): the following tests failed with MLIR bridge.
#if !defined(ENABLE_MLIR_BRIDGE_TEST)
TEST(TFCompileTest, Cond) {
CondComp cond;
EXPECT_EQ(cond.arg0_data(), cond.arg_data(0));
@ -233,7 +236,6 @@ TEST(TFCompileTest, Gather) {
EXPECT_EQ(gather_const.result0_data(), gather.results()[0]);
}
}
#endif
TEST(TFCompileTest, MatMul2) {
Eigen::ThreadPool tp(2);
@ -439,6 +441,7 @@ TEST(TFCompileTest, Function) {
EXPECT_EQ(add_fn.result0_data()[0], 3);
EXPECT_EQ(add_fn.result0_data(), add_fn.results()[0]);
}
#endif
TEST(TFCompileTest, Splits) {
Eigen::ThreadPool tp(1);
@ -492,6 +495,8 @@ TEST(TFCompileTest, TopK) {
EXPECT_EQ(expected_indices[1], fn.result1(1));
}
// TODO(bixia): the following tests failed with MLIR bridge.
#if !defined(ENABLE_MLIR_BRIDGE_TEST)
TEST(TFCompileTest, Variable) {
Eigen::ThreadPool tp(1);
Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
@ -564,6 +569,7 @@ TEST(TFCompileTest, VariableSequentialUpdatesNoAlloc) {
fn.Run();
EXPECT_NEAR(x, 0.594322f, 1e-6);
}
#endif
TEST(TFCompileTest, AssertEqAndReturnDiff) {
// Assert is converted into a no-op in XLA, so there is no failure even if the
@ -665,6 +671,11 @@ TEST(TFCompileTest, HloProfiling) {
/*clock_rate_ghz=*/1.0);
VLOG(1) << "Original HLO profile string:\n" << hlo_profile_as_string;
// Replace Arg_n with argn when the MLIR bridge is used.
#if defined(ENABLE_MLIR_BRIDGE_TEST)
RE2::GlobalReplace(&hlo_profile_as_string, "(Arg_)([0-9].)", "arg\\2");
#endif
// Strip away identifier details from the profile string to avoid this test
// being a change detector for xla internals. Identifiers such as '%dot.0.7'
// just become '%dot'.
@ -690,7 +701,6 @@ TEST(TFCompileTest, HloProfiling) {
IsSupersetOf({header, total_cycles_profile_line, dot_profile_line,
add_profile_line, tuple_profile_line}));
}
#endif
} // namespace
} // namespace tfcompile

View File

@ -39,7 +39,7 @@ def tf_library(
enable_xla_hlo_profiling = False,
mlir_components = None,
deps = None,
tags = None):
tags = []):
"""Runs tfcompile to compile a TensorFlow graph into executable code.
Given an invocation of tf_library(name="foo", ...), generates the following
@ -407,6 +407,7 @@ def target_llvm_triple():
"//tensorflow:android_arm64": "aarch64-none-android",
"//tensorflow:android_x86": "i686-none-android",
"//tensorflow:ios": "arm64-none-ios",
"//tensorflow:ios_x86_64": "x86_64-apple-ios",
"//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
"//tensorflow:macos": "x86_64-none-darwin",
"//conditions:default": "x86_64-pc-linux",

View File

@ -1,15 +1,10 @@
load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "tf_cc_test")
load("//tensorflow/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps")
load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library")
load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library")
package(
default_visibility = [
":internal",
# BEGIN-GOOGLE-INTERNAL
"//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__",
# END-GOOGLE-INTERNAL
],
default_visibility = [":internal"],
licenses = ["notice"], # Apache 2.0
)
@ -27,6 +22,17 @@ package_group(
],
)
# defs.cc/h only contains string constants, and can be included in mobile
# builds.
filegroup(
name = "mobile_srcs_no_runtime",
srcs = [
"defs.cc",
"defs.h",
],
visibility = [":friends"],
)
# Target that bundles up the XLA CPU and GPU JIT devices.
cc_library(
name = "jit",
@ -71,6 +77,19 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "xla_mlir_gpu_jit",
visibility = ["//visibility:public"],
deps = if_cuda_or_rocm([
":jit_compilation_passes",
"//tensorflow/compiler/jit/kernels:xla_ops",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
"//tensorflow/compiler/xla/service:mlir_gpu_plugin",
]),
alwayslink = 1,
)
cc_library(
name = "xla_cpu_device",
srcs = ["xla_cpu_device.cc"],
@ -96,6 +115,7 @@ cc_library(
srcs = ["xla_gpu_device.cc"],
visibility = [":friends"],
deps = [
":flags",
":jit_compilation_passes",
":xla_device",
":xla_kernel_creator", # buildcleaner: keep
@ -231,7 +251,14 @@ cc_library(
srcs = ["shape_inference_helpers.cc"],
hdrs = ["shape_inference_helpers.h"],
visibility = [":friends"],
deps = ["//tensorflow/core:graph"],
deps = select({
"//tensorflow:android": [
"//tensorflow/core:android_tensorflow_lib",
],
"//conditions:default": [
"//tensorflow/core:graph",
],
}),
)
# Internal targets below this point.
@ -469,6 +496,7 @@ cc_library(
"//tensorflow/core:framework",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
],
)

View File

@ -22,8 +22,9 @@ limitations under the License.
namespace tensorflow {
// Adds _XlaCompile and _XlaRun operations to the TF graph that compiles and
// executes (using XLA) TF function calls marked with "_XlaCompiledKernel".
// Replaces TF function calls marked with `_XlaCompiledKernel` with _XlaCompile
// and _XlaRun nodes (which compile and launch, respectively, the corresponding
// HLO module).
class BuildXlaOpsPass : public GraphOptimizationPass {
public:
// If enable_lazy_compilation is not nullopt then *enable_lazy_compilation

View File

@ -509,10 +509,10 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter(
auto it = uncompilable_nodes->find(function_identifier);
if (it == uncompilable_nodes->end()) {
std::vector<RecursiveCompilabilityChecker::UncompilableNodeInfo>
uncompileable_node_info{std::move(node_info)};
uncompilable_node_info{std::move(node_info)};
uncompilable_nodes->emplace(
std::move(function_identifier),
std::make_pair(function, std::move(uncompileable_node_info)));
std::make_pair(function, std::move(uncompilable_node_info)));
} else {
it->second.second.emplace_back(std::move(node_info));
}

View File

@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/control_flow.h"
#include "tensorflow/core/graph/graph_node_util.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/lib/hash/hash.h"
@ -96,7 +97,7 @@ limitations under the License.
// Symbolic > NonSymbolic. The lattice has height = 2 so two iterations are
// sufficient to converge.
//
// We first do an optimisitc analysis and, if it does not converge, we then fall
// We first do an optimistic analysis and, if it does not converge, we then fall
// back to a pessimistic analysis. The optimistic analysis assigns the same
// symbolic predicate to all the merge nodes whose preceding enter nodes have
// the same frame name on the first iteration. On the second iteration, if all
@ -1255,7 +1256,7 @@ Status DeadnessAnalysisImpl::GetFrameBasedTopologicalOrder(
} else if (IsRootExit(node)) {
++num_exits_for_frame[cf.frame_name];
}
// Edge NextIteration->Merge is counted before starting the traveral to
// Edge NextIteration->Merge is counted before starting the traversal to
// break the backedges.
if (IsMerge(node)) {
for (const Edge* e : node->in_edges()) {
@ -1458,11 +1459,11 @@ Status DeadnessAnalysisImpl::PopulateFrame(absl::Span<Node* const> topo,
for (Node* n : topo) {
// The nodes added to should_revisit in the previous loop need to be
// revisited now. Reprocesing these initial nodes may add *their* consumers
// to should_revisit, and these newly added nodes will also be processed by
// this very same loop. Since we're traversing the graph in topological
// order (producers before consumers) and HandleNode(n) can only ever add
// n's consumers to should_revisit, we won't "miss" an addition to
// revisited now. Reprocessing these initial nodes may add *their*
// consumers to should_revisit, and these newly added nodes will also be
// processed by this very same loop. Since we're traversing the graph in
// topological order (producers before consumers) and HandleNode(n) can only
// ever add n's consumers to should_revisit, we won't "miss" an addition to
// should_revisit.
if (should_revisit[n->id()]) {
VLOG(4) << "Revisiting " << n->name();

View File

@ -17,6 +17,8 @@ limitations under the License.
namespace tensorflow {
const char* const kXlaMustCompileAttr = "_XlaMustCompile";
const char* const kXlaCompileAttr = "_XlaCompile";
// User-provided through jit_scope APIs. Effective only when auto_jit is OFF.

View File

@ -22,7 +22,16 @@ limitations under the License.
namespace tensorflow {
// Name of attribute used to tag operators for compilation with XLA
// Implies must-compile semantics: either it will be compiled
// with XLA, or an error will be thrown.
extern const char* const kXlaMustCompileAttr; // "_XlaMustCompile"
// Implies auto-clustering: tagged nodes will be clustered and compiled with XLA
// on a best-effort basis.
extern const char* const kXlaCompileAttr; // "_XlaCompile"
// Implies auto-clustering within the given scope.
extern const char* const kXlaScopeAttr; // "_XlaScope"
extern const char* const kXlaInternalScopeAttr; // "_XlaInternalScope"
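As a rough illustration of how one of these attributes gets attached to a node (a sketch only; the helper below is hypothetical and assumes the must-compile marker carries a bool value):
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/core/framework/node_def.pb.h"
namespace tensorflow {
// Hypothetical helper: tag a node so that XLA compilation is mandatory for it.
inline void MarkMustCompile(NodeDef* node_def) {
  (*node_def->mutable_attr())[kXlaMustCompileAttr].set_b(true);
}
}  // namespace tensorflow
In practice these attributes are normally set through the Python jit_scope / experimental_compile APIs rather than by hand.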

View File

@ -27,6 +27,15 @@ limitations under the License.
namespace tensorflow {
// EncapsulateSubgraphs pass takes all the nodes with the same cluster ID
// (derived from kXlaClusterAttr=ID (kXlaClusterAttr) attribute), puts them into
// a TF function, and replaces the subgraph in the main graph with a call to
// that TF function annotated with kXlaCompiledKernelAttr (_XlaCompiledKernel).
class EncapsulateSubgraphsPass : public GraphOptimizationPass {
public:
Status Run(const GraphOptimizationPassOptions& options) override;
};
// A rewriting function to apply to each subgraph during encapsulation.
// 'arg_source_tensors' are the tensors corresponding to the arguments in the
// original source graph (*not* 'graph').
@ -95,16 +104,11 @@ extern const char* const kXlaNumResourceArgsAttr;
extern const char* const kXlaHasReferenceVarsAttr;
// Sorts each node's control inputs by their names. This guarantees that for two
// structually equivalent GraphDefs, we get the same traversal ordering on
// structurally equivalent GraphDefs, we get the same traversal ordering on
// node's control input fields.
// TODO(hpucha): Move the utilities to a more appropriate place.
void SortControlInputs(GraphDef* gdef);
class EncapsulateSubgraphsPass : public GraphOptimizationPass {
public:
Status Run(const GraphOptimizationPassOptions& options) override;
};
} // namespace tensorflow
#endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_SUBGRAPHS_PASS_H_

View File

@ -72,7 +72,7 @@ extern const char kXlaLiftedArgOutsideCompilationAttrName[];
// Attribute indicating that this is an IdentityN node receiving inputs for a
// outside compilation Placeholder node (the original outside compilation node
// is moved out of TPU comutation, and we left a Placeholder node there).
// is moved out of TPU computation, and we left a Placeholder node there).
// Attribute value will be a string, which is the outside compilation cluster
// name for the outside compilation Placeholder node.
extern const char kXlaOutsideCompilationInputsAttrName[];

View File

@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/graph_node_util.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/lib/strings/proto_serialization.h"

View File

@ -28,7 +28,7 @@
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/platform/env.h"
namespace tensorflow {
// Encapsulates nodes marked with the _xla_compile_id attribute into
// XlaLaunch operators.

View File

@ -2130,6 +2130,53 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
return Status::OK();
}
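// Copies each outside compilation Const node that also feeds non outside
// compilation consumers: the copy (with the outside compilation attribute
// stripped) takes over those consumers, while the original Const stays in the
// outside compilation cluster.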
Status CopyOutsideCompilationConstNodes(
Graph* g, const string& outside_compilation_attr_name) {
for (Node* n : g->op_nodes()) {
if (!n->IsConstant() ||
!HasNodeAttr(n->def(), outside_compilation_attr_name)) {
continue;
}
std::vector<const Edge*> out_edges(n->out_edges().begin(),
n->out_edges().end());
bool has_non_oc_output = false;
for (const Edge* e : out_edges) {
if (!e->IsControlEdge() &&
!HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) {
has_non_oc_output = true;
break;
}
}
if (!has_non_oc_output) {
continue;
}
NodeDef copy_def = n->def();
copy_def.set_name(g->NewName(n->name()));
copy_def.mutable_attr()->erase(outside_compilation_attr_name);
Status s;
Node* copy_node = g->AddNode(copy_def, &s);
TF_RETURN_IF_ERROR(s);
for (const Edge* e : n->in_edges()) {
if (e->IsControlEdge()) {
g->AddControlEdge(e->src(), copy_node);
}
}
for (const Edge* e : out_edges) {
if (!e->IsControlEdge() &&
!HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) {
Node* dst = e->dst();
int dst_input = e->dst_input();
g->RemoveEdge(e);
g->AddEdge(copy_node, 0, dst, dst_input);
}
}
}
return Status::OK();
}
} // namespace
Status RewriteOutsideCompilationSubgraphFn::operator()(
@ -2279,6 +2326,10 @@ Status ExtractOutsideCompilationForFunction(
std::vector<string> outside_compilation_host_graphs;
std::vector<string> shape_inference_graphs_to_rewrite;
if (*has_outside_compilation) {
// Copy outside compilation Const nodes with non outside compilation users.
TF_RETURN_IF_ERROR(CopyOutsideCompilationConstNodes(
fbody->graph, outside_compilation_attr_name));
// Find dependencies between outside compilation clusters.
TF_ASSIGN_OR_RETURN(auto cluster_deps,
OutsideCompilationClusterDependencies(

View File

@ -941,7 +941,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
// "const0"
// "identity0" = "const0" (outside compilation cluster "0")
// "identity1" = "const0" "^identity0" (outside compilation cluster "1",
// control depdent on cluster "0")
// control dependent on cluster "0")
// "identity2" = "identity1"
FunctionDefLibrary fdl;
{

View File

@ -48,6 +48,15 @@ bool SetterForXlaAutoJitFlag(const string& value) {
return true;
}
if (value == "fusible") {
mark_for_compilation_flags->xla_auto_jit_flag
.optimization_level_single_gpu = 1;
mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_general =
1;
mark_for_compilation_flags->tf_xla_ops_to_cluster = "FUSIBLE";
return true;
}
absl::string_view value_sv(value);
if (!absl::ConsumePrefix(&value_sv, "single-gpu(") ||
!absl::ConsumeSuffix(&value_sv, ")") ||
@ -65,7 +74,9 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
Flag("tf_xla_auto_jit", SetterForXlaAutoJitFlag, "0",
"Control compilation of operators into XLA computations on CPU and "
"GPU devices. 0 = use ConfigProto setting; -1 = off; 1 = on for "
"things very likely to be improved; 2 = on for everything. "
"things very likely to be improved; 2 = on for everything; "
"(experimental) fusible = only for Tensorflow operations that XLA "
"knows how to fuse. "
"If set to single-gpu(<N>) then this resolves to <N> for single-GPU "
"graphs (graphs that have at least one node placed on a GPU and no "
"more than one GPU is in use through the entire graph) and 0 "
@ -78,6 +89,23 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
Flag("tf_xla_max_cluster_size",
&mark_for_compilation_flags->tf_xla_max_cluster_size,
"Maximum number of operators in an XLA compilation."),
Flag(
"tf_xla_ops_to_cluster",
&mark_for_compilation_flags->tf_xla_ops_to_cluster,
"(experimental) "
"Limit the operations clustered by XLA to these operations. "
"If multiple, separate them with commas. Shortcuts: "
" PW: All point-wise operations."
" RED: All reduction operations."
" MISC: Mixed operations."
" PWRED: TF operations that get converted to PW+RED operation in XLA."
" REDUCEWINDOW: TF operations like MaxPool/AvgPool that get "
"converted to ReduceWindow in XLA."
" REDUCEWINDOWPW: Operation that get converted to ReduceWindow + PW "
"(LRN, LRNGrad)."
" BN: TF FusedBatchNorm* operations."
" FUSIBLE: All TF operations that XLA can fuse (All the above). "
"You can also put any TF operation name, e.g. 'FUSIBLE,Matmul'."),
Flag("tf_xla_clustering_debug",
&mark_for_compilation_flags->tf_xla_clustering_debug,
"Dump graphs during XLA compilation."),
@ -127,6 +155,7 @@ void AllocateAndParseFlags() {
device_flags = new XlaDeviceFlags;
device_flags->tf_xla_compile_on_demand = false;
device_flags->tf_xla_enable_xla_devices = true;
ops_flags = new XlaOpsCommonFlags;
ops_flags->tf_xla_always_defer_compilation = false;
@ -159,6 +188,12 @@ void AllocateAndParseFlags() {
"Switch a device into 'on-demand' mode, where instead of "
"autoclustering ops are compiled one by one just-in-time."),
Flag("tf_xla_enable_xla_devices",
&device_flags->tf_xla_enable_xla_devices,
"Generate XLA_* devices, where placing a computation on such a "
"device"
"forces compilation by XLA. Deprecated."),
Flag("tf_xla_always_defer_compilation",
&ops_flags->tf_xla_always_defer_compilation, ""),

View File

@ -55,6 +55,9 @@ struct MarkForCompilationPassFlags {
// Maximum number of operators in an XLA compilation.
int32 tf_xla_max_cluster_size;
// If non-empty, limit XLA clustering to the following TF operations.
string tf_xla_ops_to_cluster;
// Dump graphs during XLA compilation.
bool tf_xla_clustering_debug;
@ -84,6 +87,9 @@ struct XlaDeviceFlags {
// Enabling this mode by a legacy flag is a temporary mechanism. When this
// feature is battle-tested, we will switch this to be a session option.
bool tf_xla_compile_on_demand;
// Enables "XLA" devices if this flag is set.
bool tf_xla_enable_xla_devices;
};
// Flags common to the _Xla* ops and their kernels.

View File

@ -123,7 +123,7 @@ class GraphCycles {
absl::Span<const int32> Successors(int32 node) const;
absl::Span<const int32> Predecessors(int32 node) const;
// Return a copy of the sucessors set. This is needed for code using the
// Return a copy of the successors set. This is needed for code using the
// collection while modifying the GraphCycles.
std::vector<int32> SuccessorsCopy(int32 node) const;
// Return a copy of the predecessors set. This is needed for code using the

View File

@ -31,10 +31,10 @@ namespace tensorflow {
// EncapsulateXlaComputationsPass rewrites computations generated by the
// xla.compile() Python code into XlaLaunch nodes.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 26,
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 36,
EncapsulateXlaComputationsPass);
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 25,
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 35,
IntroduceFloatingPointJitterPass);
// from

View File

@ -1076,6 +1076,35 @@ StatusOr<bool> IsIdentityDrivingConstsInLoop(Node* node) {
return true;
}
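// Builds the set of TF op names that auto-clustering is allowed to consider,
// from the --tf_xla_ops_to_cluster flag. Category shortcuts (e.g. "FUSIBLE",
// "PW") are expanded through the whitelist table; any other token is treated
// as a literal op name. An empty set means no restriction.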
absl::flat_hash_set<string> GetOrCreateWhitelist() {
absl::flat_hash_map<string, std::vector<string>>* whitelist_table =
tensorflow::GetWhitelistTable();
MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
absl::flat_hash_set<string> whitelist;
for (auto s : absl::StrSplit(flags->tf_xla_ops_to_cluster, ',')) {
if (s == "FUSIBLE") {
for (auto pair : *whitelist_table) {
whitelist.insert(pair.second.begin(), pair.second.end());
}
} else if (whitelist_table->contains(s)) {
auto v = whitelist_table->at(s);
whitelist.insert(v.begin(), v.end());
} else if (!s.empty()) {
// Should be a user provided TF operation.
whitelist.insert(string(s));
}
}
if (VLOG_IS_ON(2) && !whitelist.empty()) {
std::vector<string> vwhitelist(whitelist.begin(), whitelist.end());
absl::c_sort(vwhitelist);
VLOG(2) << "XLA clustering will only consider the following TF operations: "
<< absl::StrJoin(vwhitelist, " ");
}
return whitelist;
}
Status MarkForCompilationPassImpl::FindCompilationCandidates() {
OptimizerOptions opts;
std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
@ -1087,9 +1116,8 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
*graph_, /*compile_time_const_arg_indices=*/nullptr,
&compile_time_const_nodes, lib_runtime));
// Iterate over nodes in sorted order so that compiler fuel is deterministic.
// We can't simply pass op_nodes().begin() and op_nodes().end to the
// We can't simply pass op_nodes().begin() and op_nodes().end() to the
// std::vector constructor because they're not proper iterators, with
// iterator_traits defined and so on.
std::vector<Node*> sorted_nodes;
@ -1108,6 +1136,19 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
VLOG(2) << "sorted_nodes.size() = " << sorted_nodes.size();
auto whitelist = GetOrCreateWhitelist();
std::vector<string> vall_ops = XlaOpRegistry::GetAllRegisteredOps();
absl::flat_hash_set<string> all_ops(vall_ops.begin(), vall_ops.end());
// Check that the user-provided TF operations really exist.
for (auto s : whitelist) {
if (!all_ops.contains(string(s))) {
return errors::InvalidArgument(
"The operation '", s,
"' passed to --tf_xla_ops_to_cluster is not supported by XLA.");
}
}
for (Node* node : sorted_nodes) {
if (*debug_options_.fuel <= 0) {
VLOG(1)
@ -1145,6 +1186,12 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
continue;
}
if (!whitelist.empty() && !whitelist.contains(node->def().op())) {
VLOG(1) << "Rejecting TF operation " << node->def().op()
<< " as it is not listed in --tf_xla_ops_to_cluster.";
continue;
}
if (compile_time_const_nodes[node->id()]) {
const OpDef* op_def;
TF_RETURN_IF_ERROR(
@ -1366,7 +1413,7 @@ Status MarkForCompilationPassImpl::Run() {
void MarkForCompilationPassImpl::DumpPostClusteringGraphs() {
DumpGraphToFile("mark_for_compilation", *graph_, flib_def_);
// We also dump out an annoated version of the TF graph where the nodes
// We also dump out an annotated version of the TF graph where the nodes
// names are prefixed with the cluster names. This can help visualizing the
// clustering decisions on TensorBoard.
Graph new_graph(graph_->op_registry());
@ -1714,7 +1761,303 @@ Status MarkForCompilationPass::RunForTest(
return MarkForCompilation(options, debug_options);
}
absl::flat_hash_map<string, std::vector<string>>* GetWhitelistTable() {
// Table format: category name: {list of TF operations in that category}
static absl::flat_hash_map<string, std::vector<string>>* result =
new absl::flat_hash_map<string, std::vector<string>>{
// Unary
{"PW",
{"ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin",
"Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp", "Expm1",
"Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal", "Log",
"Log1p", "Invert", "LogicalNot", "Ndtri", "Neg", "Rint", "Round",
"Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt",
"Square", "Tan", "Tanh", "Real", "Imag", "Erf", "Erfc", "Erfinv",
"Lgamma", "Digamma",
// Binary
"Add", "AddV2", "Sub", "Mul", "Div", "Atan2", "Complex", "DivNoNan",
"MulNoNan", "FloorDiv", "Xlogy", "Xlog1py", "Xdivy", "FloorMod",
"BitwiseAnd", "BitwiseOr", "BitwiseXor", "LeftShift", "RightShift",
"LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv",
"ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "TruncateDiv",
"TruncateMod", "Equal", "NotEqual", "Greater", "GreaterEqual",
"Less", "LessEqual", "SigmoidGrad", "SoftplusGrad", "SoftsignGrad",
"TanhGrad", "Pow", "SquaredDifference", "ApproximateEqual",
// Others
"AddN", "Bitcast", "Cast", "ClipByValue", "Const", "Empty",
"Identity", "IdentityN", "Relu", "Relu6", "ReluGrad", "Relu6Grad",
"LeakyReluGrad", "Elu", "EluGrad", "Selu", "SeluGrad", "Select",
"SelectV2", "Transpose", "ConjugateTranspose",
"_UnaryOpsComposition",
// The following 4 operations are converted to identity
"PlaceholderWithDefault", "PreventGradient", "StopGradient",
"Snapshot"}},
// clang-format off
{"RED",
{"All", "Any", "Min", "Max", "Mean", "Prod", "Sum"}},
// clang-format on
{"PWRED",
{"ArgMax", "ArgMin", "DiagPart", "Softmax",
"SparseSoftmaxCrossEntropyWithLogits", "LogSoftmax"}},
{"REDUCEWINDOW",
{"ArgMax", "ArgMin", "DiagPart", "Softmax",
"SparseSoftmaxCrossEntropyWithLogits", "LogSoftmax"}},
{"REDUCEWINDOWPW", {"BiasAddGrad", "LRN", "LRNGrad"}},
{"BN",
{"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3",
"_FusedBatchNormEx", "FusedBatchNormGrad", "FusedBatchNormGradV2",
"FusedBatchNormGradV3"}},
{"SORT", {"TopKV2"}}, // XLA version much faster then TF version.
{"MISC",
// clang-format off
{"BroadcastTo", "ExpandDims", "Fill", "NoOp",
"Range", "Rank", "Reshape", "Shape", "ShapeN", "Size", "Squeeze",
"Transpose", "ZerosLike", "OnesLike", "BiasAdd" /*PW + Broadcast*/,
"BroadcastArgs", "BroadcastGradientArgs", "OneHot", "Concat", "ConcatV2",
"ConcatOffset", "Const", "MirrorPad", "Pack", "Pad", "PadV2", "Reverse",
"ReverseV2", "ReverseSequence", "Slice", "Split", "SplitV",
"StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign",
"Tile", "Transpose", "InvertPermutation", "Unpack"}}};
// clang-format on
return result;
}
namespace testing {
void ResetClusterSequenceNumber() { cluster_sequence_num = 0; }
absl::flat_hash_set<string> GetKnownXLAWhitelistOp() {
absl::flat_hash_set<string> result{"AdjustContrastv2",
"AdjustHue",
"AdjustSaturation",
"Asinh",
"Assert",
"AssignAddVariableOp",
"AssignSubVariableOp",
"AssignVariableOp",
"AvgPool",
"AvgPool3D",
"AvgPool3DGrad",
"AvgPoolGrad",
"BatchMatMul",
"BatchMatMulV2",
"BatchToSpace",
"BatchToSpaceND",
"BesselI0e",
"BesselI1e",
"Betainc",
"BiasAddV1",
"Bucketize",
"Case",
"CheckNumerics",
"Cholesky",
"ControlTrigger",
"Conv2D",
"Conv2DBackpropFilter",
"Conv2DBackpropInput",
"Conv3D",
"Conv3DBackpropFilterV2",
"Conv3DBackpropInputV2",
"Cross",
"Cumprod",
"Cumsum",
"DataFormatDimMap",
"DataFormatVecPermute",
"DepthToSpace",
"DepthwiseConv2dNative",
"DepthwiseConv2dNativeBackpropFilter",
"DepthwiseConv2dNativeBackpropInput",
"Dequantize",
"Diag",
"DynamicStitch",
"Einsum",
"EmptyTensorList",
"ExtractImagePatches",
"FFT",
"FFT2D",
"FFT3D",
"FakeParam",
"FakeQuantWithMinMaxArgs",
"FakeQuantWithMinMaxArgsGradient",
"FakeQuantWithMinMaxVars",
"FakeQuantWithMinMaxVarsGradient",
"Gather",
"GatherNd",
"GatherV2",
"HSVToRGB",
"IFFT",
"IFFT2D",
"IFFT3D",
"IRFFT",
"IRFFT2D",
"IRFFT3D",
"If",
"InTopKV2",
"L2Loss",
"LeakyRelu",
"LinSpace",
"ListDiff",
"LogMatrixDeterminant",
"MatMul",
"MatrixBandPart",
"MatrixDiag",
"MatrixDiagPart",
"MatrixDiagPartV2",
"MatrixDiagPartV3",
"MatrixDiagV2",
"MatrixDiagV3",
"MatrixInverse",
"MatrixSetDiag",
"MatrixSetDiagV2",
"MatrixSetDiagV3",
"MatrixSolve",
"MatrixTriangularSolve",
"MaxPool",
"MaxPool3D",
"MaxPool3DGrad",
"MaxPool3DGradGrad",
"MaxPoolGrad",
"MaxPoolGradGrad",
"MaxPoolGradGradV2",
"MaxPoolGradV2",
"MaxPoolV2",
"Multinomial",
"NextAfter",
"NonMaxSuppressionV4",
"ParallelDynamicStitch",
"ParameterizedTruncatedNormal",
"PartitionedCall",
"Qr",
"QuantizeAndDequantizeV2",
"QuantizeAndDequantizeV3",
"RFFT",
"RFFT2D",
"RFFT3D",
"RGBToHSV",
"RandomShuffle",
"RandomStandardNormal",
"RandomUniform",
"RandomUniformInt",
"ReadVariableOp",
"ResizeBilinear",
"ResizeBilinearGrad",
"ResizeNearestNeighbor",
"ResourceApplyAdaMax",
"ResourceApplyAdadelta",
"ResourceApplyAdagrad",
"ResourceApplyAdagradDA",
"ResourceApplyAdagradV2",
"ResourceApplyAdam",
"ResourceApplyAddSign",
"ResourceApplyCenteredRMSProp",
"ResourceApplyFtrl",
"ResourceApplyFtrlV2",
"ResourceApplyGradientDescent",
"ResourceApplyKerasMomentum",
"ResourceApplyMomentum",
"ResourceApplyPowerSign",
"ResourceApplyProximalAdagrad",
"ResourceApplyProximalGradientDescent",
"ResourceApplyRMSProp",
"ResourceGather",
"ResourceScatterAdd",
"ResourceScatterDiv",
"ResourceScatterMax",
"ResourceScatterMin",
"ResourceScatterMul",
"ResourceScatterNdAdd",
"ResourceScatterNdSub",
"ResourceScatterNdUpdate",
"ResourceScatterSub",
"ResourceScatterUpdate",
"Roll",
"ScatterNd",
"SelfAdjointEigV2",
"SoftmaxCrossEntropyWithLogits",
"SpaceToBatch",
"SpaceToBatchND",
"SpaceToDepth",
"SparseMatMul",
"SparseToDense",
"StackCloseV2",
"StackPopV2",
"StackPushV2",
"StackV2",
"StatefulPartitionedCall",
"StatefulStandardNormalV2",
"StatefulTruncatedNormal",
"StatefulUniform",
"StatefulUniformFullInt",
"StatefulUniformInt",
"StatelessIf",
"StatelessMultinomial",
"StatelessRandomNormal",
"StatelessRandomUniform",
"StatelessRandomUniformInt",
"StatelessTruncatedNormal",
"StatelessWhile",
"Svd",
"SymbolicGradient",
"TensorArrayCloseV3",
"TensorArrayConcatV3",
"TensorArrayGatherV3",
"TensorArrayGradV3",
"TensorArrayReadV3",
"TensorArrayScatterV3",
"TensorArraySizeV3",
"TensorArraySplitV3",
"TensorArrayV3",
"TensorArrayWriteV3",
"TensorListElementShape",
"TensorListFromTensor",
"TensorListGather",
"TensorListGetItem",
"TensorListLength",
"TensorListPopBack",
"TensorListPushBack",
"TensorListReserve",
"TensorListSetItem",
"TensorListStack",
"TensorScatterAdd",
"TensorScatterSub",
"TensorScatterUpdate",
"TridiagonalSolve",
"TruncatedNormal",
"UnsortedSegmentMax",
"UnsortedSegmentMin",
"UnsortedSegmentProd",
"UnsortedSegmentSum",
"VarIsInitializedOp",
"VariableShape",
"While",
"XlaBroadcastHelper",
"XlaConv",
"XlaDequantize",
"XlaDot",
"XlaDynamicSlice",
"XlaDynamicUpdateSlice",
"XlaEinsum",
"XlaGather",
"XlaIf",
"XlaKeyValueSort",
"XlaPad",
"XlaRecv",
"XlaReduce",
"XlaReduceWindow",
"XlaReplicaId",
"XlaScatter",
"XlaSelectAndScatter",
"XlaSelfAdjointEig",
"XlaSend",
"XlaSharding",
"XlaSort",
"XlaSvd",
"XlaWhile",
"_Arg",
"_ArrayToList",
"_ListToArray",
"_Retval"};
return result;
}
} // namespace testing
} // namespace tensorflow

View File

@ -20,6 +20,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_
#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_
#include "absl/container/flat_hash_set.h"
#include "tensorflow/compiler/jit/compilability_check_util.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
@ -33,8 +34,9 @@ extern const char* const kXlaClusterAttr;
// compilation by the encapsulate subgraphs pass.
extern const char* const kXlaOutsideCompilationAttr;
// Pass that marks a subset of operators in the graph with attribute
// _XlaCluster so they are compiled by the EncapsulateSubgraphsPass.
// Marks a subset of nodes in the graph which are to be clustered
// with an attribute _XlaCluster=<cluster id> so they are picked up by the
// EncapsulateSubgraphsPass.
class MarkForCompilationPass : public GraphOptimizationPass {
public:
MarkForCompilationPass() = default;
@ -56,11 +58,16 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef,
RecursiveCompilabilityChecker::UncompilableNodesMap*
uncompilable_node_info = nullptr);
absl::flat_hash_map<string, std::vector<string>>* GetWhitelistTable();
namespace testing {
// DO NOT USE IN PRODUCTION.
//
// Resets some internal state to let us write reliable unit tests.
void ResetClusterSequenceNumber();
// Return a list of operations that we choose not to put into the whitelist.
absl::flat_hash_set<string> GetKnownXLAWhitelistOp();
} // namespace testing
} // namespace tensorflow

View File

@ -1803,6 +1803,35 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) {
EXPECT_NE(clusters["relu0"], clusters["relu1"]);
}
}
TEST(XlaCompilationTest, XLALiteWhitelist) {
auto* whitelist_table = tensorflow::GetWhitelistTable();
absl::flat_hash_set<string> hwhitelist;
std::vector<string> vall_ops = XlaOpRegistry::GetAllRegisteredOps();
absl::flat_hash_set<string> all_ops(vall_ops.begin(), vall_ops.end());
// Check that all the operations in the table are existing TF operations
for (auto pair : *whitelist_table) {
hwhitelist.insert(pair.second.begin(), pair.second.end());
for (auto op : pair.second) {
ASSERT_TRUE(all_ops.contains(op));
}
}
// Check that all registered XLA operations are in the whitelist
// table or are known to not be in it.
absl::flat_hash_set<string> known_not_in_list =
tensorflow::testing::GetKnownXLAWhitelistOp();
std::vector<string> unknow_op;
for (string op : vall_ops) {
if (!hwhitelist.contains(op) && !known_not_in_list.contains(op)) {
unknow_op.push_back(op);
}
}
EXPECT_TRUE(unknow_op.empty())
<< "Someone added support for a new TF opeations inside XLA. They must "
"be included in the XLALite whitelist or blacklist:\n"
<< absl::StrJoin(unknow_op, "\n");
}
} // namespace
} // namespace tensorflow

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/jit/node_matchers.h"
#include <utility>
#include "absl/algorithm/container.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
@ -24,6 +25,7 @@ limitations under the License.
#include "tensorflow/core/framework/attr_value_util.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/graph/graph_node_util.h"
namespace tensorflow {
namespace testing {

View File

@ -187,7 +187,7 @@ impl::NodeMatcherProperties Op(string op);
// Matches a node with assigned device `assigned_device`.
impl::NodeMatcherProperties AssignedDevice(string assigned_device);
// Matches a node with a boolean typed attrbute named `name` and with value
// Matches a node with a boolean typed attribute named `name` and with value
// `value`.
template <typename ValueTy>
impl::NodeMatcherProperties Attr(const string& name, ValueTy value) {

View File

@ -125,7 +125,7 @@ TEST(NodeMatchers, CheckControlDependence) {
"is any node");
}
TEST(NodeMatchers, ConstVaulue) {
TEST(NodeMatchers, ConstValue) {
Scope root = Scope::NewRootScope().ExitOnError();
Output placeholder =
ops::Placeholder(root.WithOpName("placeholder"), DT_FLOAT);

View File

@ -110,7 +110,7 @@ Merges the outputs from the PartitionedCall node and the _XlaRun node.
Unlike the TensorFlow Merge op, which requires inputs of some types to be
placed on the host, the _XlaMerge op can merge inputs of all types when
placed on the device. This prevents the need for copy operations, in
particluar when an XLA cluster has int32 outputs. The _XlaMerge up does not
particular when an XLA cluster has int32 outputs. The _XlaMerge op does not
have a value_index output that identifies the chosen input.
)");

View File

@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/framework/memory_types.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/graph/graph_node_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/public/version.h"

View File

@ -17,7 +17,10 @@ limitations under the License.
#include "tensorflow/compiler/jit/shape_inference_helpers.h"
#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/util/dump_graph.h"
@ -39,7 +42,7 @@ Status ShapeHandleToTensorShape(shape_inference::InferenceContext* context,
return PartialTensorShape::MakePartialShape(dims.data(), dims.size(), shape);
}
Status PropagateShapes(const Graph& graph,
Status PropagateShapes(Graph* graph,
const std::map<int, InferredShape>& arg_shapes,
const std::vector<BackEdgeHelper::BackEdge>& back_edges,
ShapeRefiner* shape_refiner) {
@ -54,7 +57,7 @@ Status PropagateShapes(const Graph& graph,
// shapes.
// TODO(phawkins): handle cyclic graphs.
std::vector<Node*> order;
GetReversePostOrder(graph, &order);
GetReversePostOrder(*graph, &order);
for (Node* n : order) {
// Ignore the status returned by the shape_refiner. We want the best effort
@ -99,6 +102,67 @@ Status PropagateShapes(const Graph& graph,
}
}
// Sometimes we have VariableShape nodes in a while loop (after Enter nodes).
// They won't be constant-folded because TensorFlow constant folding does
// not handle Enter nodes (and thus does not handle any nodes after Enter
// nodes). We try to replace such VariableShape nodes with Const nodes here.
if (n->type_string() == "VariableShape") {
shape_inference::InferenceContext* context = shape_refiner->GetContext(n);
auto handle_shapes_and_types = context->input_handle_shapes_and_types(0);
if (handle_shapes_and_types && !handle_shapes_and_types->empty()) {
shape_inference::ShapeHandle handle =
handle_shapes_and_types->at(0).shape;
TensorShapeProto shape_proto;
context->ShapeHandleToProto(handle, &shape_proto);
if (!shape_proto.unknown_rank()) {
NodeDef const_def;
const_def.set_op("Const");
Node* var_node;
TF_RETURN_IF_ERROR(n->input_node(0, &var_node));
const_def.set_name(
graph->NewName(absl::StrCat("var_shape_", var_node->name())));
DataType dtype = n->output_type(0);
AddNodeAttr("dtype", dtype, &const_def);
TensorProto value;
value.set_dtype(dtype);
value.mutable_tensor_shape()->add_dim()->set_size(
shape_proto.dim_size());
for (const auto& dim : shape_proto.dim()) {
if (dtype == DT_INT32) {
value.add_int_val(dim.size());
} else {
value.add_int64_val(dim.size());
}
}
AddNodeAttr("value", value, &const_def);
for (auto const& attr : n->attrs()) {
if (*attr.first.begin() == '_') {
AddNodeAttr(attr.first, attr.second, &const_def);
}
}
Status s;
Node* const_node = graph->AddNode(const_def, &s);
TF_RETURN_IF_ERROR(s);
graph->AddControlEdge(var_node, const_node);
std::vector<const Edge*> out_edges(n->out_edges().begin(),
n->out_edges().end());
for (const Edge* e : out_edges) {
if (e->IsControlEdge()) {
graph->AddControlEdge(const_node, e->dst());
graph->RemoveEdge(e);
} else {
Node* dst = e->dst();
int dst_input = e->dst_input();
graph->RemoveEdge(e);
graph->AddEdge(const_node, 0, dst, dst_input);
}
}
}
}
}
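// Illustration (hypothetical values): for a resource variable of known shape
// [2, 3] and an int32 VariableShape output, the Const value built above is a
// length-2 int32 vector:
//   TensorProto value;
//   value.set_dtype(DT_INT32);
//   value.mutable_tensor_shape()->add_dim()->set_size(2);  // rank of the variable
//   value.add_int_val(2);
//   value.add_int_val(3);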
// Merge node causes a loop so we remove NextIteration->Merge edge before
// performing shape inference. But removing those edges also prevents us
// from inferring output shape for Merge node (we need shapes for all its
@ -196,7 +260,7 @@ Status InferShapes(Graph* graph, const std::map<int, InferredShape>& arg_shapes,
// the shape inference is complete.
BackEdgeHelper back_edge;
TF_RETURN_IF_ERROR(back_edge.Remove(graph));
TF_RETURN_IF_ERROR(PropagateShapes(*graph, arg_shapes,
TF_RETURN_IF_ERROR(PropagateShapes(graph, arg_shapes,
back_edge.RemovedEdges(), &shape_refiner));
TF_RETURN_IF_ERROR(back_edge.Replace());

View File

@ -36,8 +36,13 @@ class XlaCpuDeviceFactory : public DeviceFactory {
};
Status XlaCpuDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
devices->push_back(absl::StrCat("/physical_device:", DEVICE_XLA_CPU, ":0"));
XlaDeviceFlags* flags = GetXlaDeviceFlags();
if (!flags->tf_xla_enable_xla_devices) {
LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set";
return Status::OK();
}
devices->push_back(absl::StrCat("/physical_device:", DEVICE_XLA_CPU, ":0"));
return Status::OK();
}
@ -45,6 +50,10 @@ Status XlaCpuDeviceFactory::CreateDevices(
const SessionOptions& session_options, const string& name_prefix,
std::vector<std::unique_ptr<Device>>* devices) {
XlaDeviceFlags* flags = GetXlaDeviceFlags();
if (!flags->tf_xla_enable_xla_devices) {
LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set";
return Status::OK();
}
bool compile_on_demand = flags->tf_xla_compile_on_demand;
XlaOpRegistry::DeviceRegistration registration;

View File

@ -140,7 +140,6 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
// The device tensor should always be fresh.
TF_RET_CHECK(!xla_tensor->has_shaped_buffer());
xla_tensor->set_host_tensor(*cpu_tensor);
TF_RETURN_IF_ERROR(
xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
stream_->parent()->device_ordinal()));
@ -262,7 +261,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
<< xla_tensor->shaped_buffer().ToString();
// For devices that don't allow sync on completion, the device execution is
// deferred. We check the execution stream status here to avoid wrong
// results from a failed stream being propogated to following
// results from a failed stream being propagated to following
// host-side ops.
if (!device_allows_sync_on_completion) {
done_status.Update(xla_tensor->RefreshStatusOfStreams());

View File

@ -191,7 +191,7 @@ class XlaAssignVariableOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE), \
data::IteratorGetNextAsOptionalOp); \
REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE), \
data::IteratorGetNextSyncOp); \
data::IteratorGetNextOp); \
REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle") \
.Device(DEVICE) \
.HostMemory("string_handle"), \

View File

@ -17,9 +17,11 @@ limitations under the License.
// operators using XLA via the XLA "CUDA" (GPU) backend.
#include <set>
#include "absl/memory/memory.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "tensorflow/compiler/jit/flags.h"
#include "tensorflow/compiler/jit/kernels/xla_ops.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
@ -61,6 +63,12 @@ class XlaGpuDeviceFactory : public DeviceFactory {
};
Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
XlaDeviceFlags* flags = GetXlaDeviceFlags();
if (!flags->tf_xla_enable_xla_devices) {
LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set";
return Status::OK();
}
auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
if (!platform.ok()) {
// Treat failures as non-fatal; there might not be a GPU in the machine.
@ -84,6 +92,12 @@ Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
Status XlaGpuDeviceFactory::CreateDevices(
const SessionOptions& session_options, const string& name_prefix,
std::vector<std::unique_ptr<Device>>* devices) {
XlaDeviceFlags* flags = GetXlaDeviceFlags();
if (!flags->tf_xla_enable_xla_devices) {
LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set";
return Status::OK();
}
XlaOpRegistry::DeviceRegistration registration;
registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
registration.autoclustering_policy =
@ -109,6 +123,14 @@ Status XlaGpuDeviceFactory::CreateDevices(
VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
return Status::OK();
}
auto iter = session_options.config.device_count().find("GPU");
if (iter != session_options.config.device_count().end() &&
iter->second == 0) {
// Device count for GPU is 0.
return Status::OK();
}
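// Hypothetical caller sketch: a session configured with zero GPUs now also
// skips XLA_GPU device creation, e.g.
//   SessionOptions options;
//   (*options.config.mutable_device_count())["GPU"] = 0;
//   // XlaGpuDeviceFactory::CreateDevices(options, ...) then returns early.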
string allowed_gpus =
session_options.config.gpu_options().visible_device_list();
absl::optional<std::set<int>> gpu_ids =

View File

@ -21,7 +21,7 @@ namespace tensorflow {
bool XlaKernelCreator::CanCreateKernel(const FunctionLibraryRuntime& flr,
const NodeDef& node_def) const {
return CanCreateXlaKernel(flr, node_def);
return CanCreateXlaKernel(node_def);
}
Status XlaKernelCreator::CreateKernel(FunctionLibraryRuntime* flr,

View File

@ -95,15 +95,17 @@ AttrValue BoolAttr(bool b) {
TEST_F(XlaKernelCreatorTest, OneFloatOneResourceArgument) {
FunctionDef fdef = XTimesY();
(*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(true);
(*fdef.mutable_attr())["_XlaMustCompile"] = BoolAttr(true);
Init({fdef});
XlaKernelCreator xla_kernel_creator;
Status status = xla_kernel_creator.CreateKernel(
flr_, ToNodeDef(R"pb(
NodeDef callsite =
ToNodeDef(R"pb(
name: 'XTimesY' op: 'XTimesY' input: 'a' input: 'b'
)pb"),
&kernel_);
)pb");
(*callsite.mutable_attr())["_XlaMustCompile"] = BoolAttr(true);
// Note: need to set attribute on the created node.
Status status = xla_kernel_creator.CreateKernel(flr_, callsite, &kernel_);
ASSERT_TRUE(status.ok()) << status.ToString();
EXPECT_EQ("XTimesY", kernel_->name());
@ -137,7 +139,7 @@ TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrNotSet) {
TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrIsSetToFalse) {
FunctionDef fdef = XTimesY();
(*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(false);
(*fdef.mutable_attr())["_XlaMustCompile"] = BoolAttr(false);
Init({fdef});
XlaKernelCreator xla_kernel_creator;

View File

@ -23,7 +23,9 @@ limitations under the License.
#include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
#include "tensorflow/compiler/tf2xla/const_analysis.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/util/ptr_util.h"
@ -68,40 +70,10 @@ class SinglePassSearch {
};
} // namespace
bool CanCreateXlaKernel(const FunctionLibraryRuntime& flr,
const NodeDef& node_def) {
const FunctionDef* function_def =
flr.GetFunctionLibraryDefinition()->Find(node_def.name());
if (function_def == nullptr) {
// The node def is not calling a function. Individual ops can be
// run directly using on-demand mode, no need to create XlaLaunch
// kernel for them.
return false;
}
// If kXlaCompileAttr is set on the node_def, use its value.
const auto& it = node_def.attr().find(kXlaCompileAttr);
if (it != node_def.attr().end()) {
return it->second.b();
}
// kXlaCompileAttr is not set on node_def, check if it is set on
// FunctionDef.
bool xla_compile = false;
Status status = flr.GetFunctionLibraryDefinition()->GetAttr(
node_def, kXlaCompileAttr, &xla_compile);
if (!status.ok() || !xla_compile) {
if (VLOG_IS_ON(3)) {
if (!status.ok()) {
VLOG(3) << "No " << kXlaCompileAttr << " attr defined for "
<< node_def.op() << ". status=" << status.ToString();
} else {
VLOG(3) << node_def.op() << " is explicitly marked not to be compiled";
}
}
return false;
}
return true;
bool CanCreateXlaKernel(const NodeDef& node_def) {
// If kXlaMustCompileAttr is set on the node_def, use its value.
const auto& it = node_def.attr().find(kXlaMustCompileAttr);
return it != node_def.attr().end() && it->second.b();
}
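// Hypothetical caller sketch (node name and op are made up): only the node's
// own attributes are inspected now, so marking the call site is sufficient.
//   NodeDef call;
//   call.set_name("f");
//   call.set_op("XTimesY");
//   (*call.mutable_attr())["_XlaMustCompile"].set_b(true);
//   bool will_compile = CanCreateXlaKernel(call);  // true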
// Given a FunctionLibraryRuntime and a NodeDef calling a function in the
@ -118,8 +90,11 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr,
FunctionLibraryRuntime::Handle handle;
// If node_def is not instantiable, e.g., the function does not exist,
// simply bail out.
NameAttrList function;
TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function));
TF_RETURN_IF_ERROR(
flr->Instantiate(node_def.op(), AttrSlice(&node_def.attr()), &handle));
flr->Instantiate(function.name(), AttrSlice(&function.attr()), &handle));
*fbody = flr->GetFunctionBody(handle);
CHECK(*fbody); // Can't be nullptr since we just instantiated it.
const DataTypeVector& arg_types = (*fbody)->arg_types;
@ -149,7 +124,7 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr,
Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def,
std::unique_ptr<OpKernel>* kernel) {
if (!CanCreateXlaKernel(*flr, node_def)) {
if (!CanCreateXlaKernel(node_def)) {
return errors::Internal("Invalid node: ", node_def.ShortDebugString());
}
@ -222,7 +197,7 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def,
// using xla::ComputationDataHandle, which is just a symbolic handle that
xla::ComputationBuilder assigns. How does this handle get assigned for
// constant arguments? Even constant arguments get an _Arg node in the graph
// instatiated for Function compilation. The tf2xla kernel for constant _Arg
// instantiated for Function compilation. The tf2xla kernel for constant _Arg
// nodes takes the constant value, converts it to XlaLiteral, and feeds it
// to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This
// constant XlaLiteral is included in the HLO graph, and subsequently, in
@ -241,9 +216,7 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def,
// Create the kernel.
NameAttrList function;
function.set_name(node_def.op());
*(function.mutable_attr()) = node_def.attr();
TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function));
Device* dev = flr->device();
Status s;
OpKernelConstruction construction(

View File

@ -24,11 +24,9 @@ namespace tensorflow {
class FunctionLibraryRuntime;
class OpKernel;
// Given a NodeDef 'node_def' and the function library runtime 'flr', returns
// true if 'node_def' is a call to a compilable function defined in 'flr',
// with the kXlaCompileAttr set.
bool CanCreateXlaKernel(const FunctionLibraryRuntime& flr,
const NodeDef& node_def);
// Given a NodeDef `node_def` returns true iff `node_def` has kXlaMustCompileAttr
// set.
bool CanCreateXlaKernel(const NodeDef& node_def);
// Given a supported NodeDef, returns a XlaLaunchOp that computes the node.
Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def,

View File

@ -84,9 +84,9 @@ VariableInfo::~VariableInfo() {
}
}
// Returns a vector of VaribleInfo instances for the resource variable inputs to
// the kernel with context `ctx`. The input indices for the resource variable
// inputs are in `variable_indices`.
// Returns a vector of VariableInfo instances for the resource variable inputs
// to the kernel with context `ctx`. The input indices for the resource
// variable inputs are in `variable_indices`.
static Status GetVariableInfosFromCtxInputs(
OpKernelContext* ctx, absl::Span<const int> variable_indices,
std::vector<VariableInfo>* result) {

View File

@ -6,7 +6,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
package(
default_visibility = [
"//tensorflow/compiler/tf2xla:__subpackages__",
"@local_config_mlir//:friends",
"@llvm-project//mlir:friends",
],
licenses = ["notice"], # Apache 2.0
)
@ -30,8 +30,8 @@ cc_library(
hdrs = ["op_or_arg_name_mapper.h"],
deps = [
"@com_google_absl//absl/strings",
"@llvm//:support",
"@local_config_mlir//:IR",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
],
)
@ -43,11 +43,14 @@ cc_library(
":passes",
"//tensorflow/core:lib",
"//tensorflow/core/platform:logging",
"@llvm//:support",
"@local_config_mlir//:MlirOptLib",
"@local_config_mlir//:Pass",
"@local_config_mlir//:Support",
"@local_config_mlir//test:TestTransforms",
"@llvm-project//llvm:support",
"@llvm-project//mlir:AffineDialectRegistration",
"@llvm-project//mlir:LoopDialectRegistration",
"@llvm-project//mlir:MlirOptLib",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOpsDialectRegistration",
"@llvm-project//mlir:Support",
"@llvm-project//mlir/test:TestTransforms",
],
)
@ -73,15 +76,17 @@ cc_library(
"//tensorflow/compiler/mlir/xla:lhlo",
"//tensorflow/compiler/mlir/xla:lhlo_fuse_linalg",
"//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine",
"//tensorflow/compiler/mlir/xla:lhlo_legalize_to_gpu",
"//tensorflow/compiler/mlir/xla:lhlo_legalize_to_linalg",
"//tensorflow/compiler/mlir/xla:xla_dialect_registration",
"//tensorflow/compiler/mlir/xla:xla_legalize_control_flow",
"//tensorflow/compiler/mlir/xla:xla_legalize_tf",
"//tensorflow/compiler/mlir/xla:xla_legalize_to_standard",
"//tensorflow/compiler/mlir/xla:xla_lower_general_dot",
"@local_config_mlir//:AffineDialectRegistration",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:QuantOpsDialectRegistration",
"//tensorflow/compiler/mlir/xla:xla_lower",
"//tensorflow/compiler/mlir/xla:xla_materialize_broadcasts",
"//tensorflow/compiler/mlir/xla:xla_test_passes",
"@llvm-project//mlir:AffineOps",
"@llvm-project//mlir:QuantOps",
],
)
@ -91,7 +96,7 @@ cc_library(
hdrs = ["init_mlir.h"],
deps = [
"//tensorflow/core:lib",
"@llvm//:support",
"@llvm-project//llvm:support",
],
)
@ -121,11 +126,11 @@ tf_cc_binary(
"//tensorflow/core:tensorflow",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/strings",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:Support",
"@local_config_mlir//:TranslateClParser",
"@local_config_mlir//:Translation",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TranslateClParser",
"@llvm-project//mlir:Translation",
],
)

View File

@ -1,11 +1,11 @@
# MLIR dialects and utilities for TensorFlow, TensorFlow Lite and XLA.
This module contains the MLIR
([Multi-Level Intermediate Representation](https://github.com/tensorflow/mlir))
([Multi-Level Intermediate Representation](https://mlir.llvm.org))
dialects and utilities for
1. TensorFlow
2. XLA
3. TF Lite
See [MLIR repo](https://github.com/tensorflow/mlir) for complete documentation.
See [MLIR's website](https://mlir.llvm.org) for complete documentation.

View File

@ -10,7 +10,7 @@ load("@bazel_skylib//lib:paths.bzl", "paths")
# Default values used by the test runner.
_default_test_file_exts = ["mlir", ".pbtxt", ".td"]
_default_driver = "@local_config_mlir//:run_lit.sh"
_default_driver = "@llvm-project//mlir:run_lit.sh"
_default_size = "small"
_default_tags = ["no_rocm"]
@ -50,16 +50,16 @@ def _run_lit_test(name, data, size, tags, driver, features):
native.py_test(
name = name,
srcs = ["@llvm//:lit"],
srcs = ["@llvm-project//llvm:lit"],
tags = tags,
args = [
"tensorflow/compiler/mlir/" + paths.basename(data[-1]) + " --config-prefix=runlit -v",
] + features,
data = data + [
"//tensorflow/compiler/mlir:litfiles",
"@llvm//:FileCheck",
"@llvm//:count",
"@llvm//:not",
"@llvm-project//llvm:FileCheck",
"@llvm-project//llvm:count",
"@llvm-project//llvm:not",
],
size = size,
main = "lit.py",

View File

@ -1,6 +1,6 @@
load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary")
load(
"@local_config_mlir//:tblgen.bzl",
"//third_party/mlir:tblgen.bzl",
"gentbl",
)
@ -8,13 +8,14 @@ package(
default_visibility = [
# TODO(jpienaar): Make the visibility more restrictive.
":friends",
"//tensorflow/lite/experimental/tf_runtime:__subpackages__",
],
licenses = ["notice"], # Apache 2.0
)
package_group(
name = "friends",
includes = ["@local_config_mlir//:subpackages"],
includes = ["//third_party/mlir:subpackages"],
packages = [
"//learning/brain/experimental/mlir/...",
"//learning/brain/google/xla/...",
@ -27,7 +28,7 @@ filegroup(
srcs = [
"ir/tfl_ops.td",
"//tensorflow/compiler/mlir/lite/quantization:quantization_td_files",
"@local_config_mlir//:OpBaseTdFiles",
"@llvm-project//mlir:OpBaseTdFiles",
],
)
@ -46,8 +47,16 @@ gentbl(
"-gen-op-doc",
"g3doc/tfl_ops.md",
),
(
"-gen-op-interface-decls",
"ir/tfl_ops_interface.h.inc",
),
(
"-gen-op-interface-defs",
"ir/tfl_ops_interface.cc.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "ir/tfl_ops.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
@ -62,11 +71,11 @@ gentbl(
"transforms/generated_prepare_tf.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/prepare_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_optimize_td_files",
],
@ -80,11 +89,11 @@ gentbl(
"transforms/generated_lower_static_tensor_list.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/tensorlist_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files",
],
)
@ -97,11 +106,11 @@ gentbl(
"transforms/generated_legalize_tf.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/legalize_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files",
],
)
@ -114,11 +123,12 @@ gentbl(
"transforms/generated_optimize.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/optimize_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files",
],
)
@ -130,11 +140,11 @@ gentbl(
"transforms/generated_quantize.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/quantize_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
],
)
@ -146,11 +156,11 @@ gentbl(
"transforms/generated_post_quantize.inc",
),
],
tblgen = "@local_config_mlir//:mlir-tblgen",
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "transforms/post_quantize_patterns.td",
td_srcs = [
":tensorflow_lite_ops_td_files",
"@local_config_mlir//:StdOpsTdFiles",
"@llvm-project//mlir:StdOpsTdFiles",
],
)
@ -163,9 +173,9 @@ cc_library(
"utils/validators.h",
],
deps = [
"@local_config_mlir//:Dialect",
"@local_config_mlir//:IR",
"@local_config_mlir//:StandardOps",
"@llvm-project//mlir:Dialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
],
)
@ -175,6 +185,8 @@ cc_library(
"ir/tfl_ops.cc",
"ir/tfl_ops.cc.inc",
"ir/tfl_ops.h.inc",
"ir/tfl_ops_interface.cc.inc",
"ir/tfl_ops_interface.h.inc",
"utils/attribute_utils.cc",
],
hdrs = [
@ -183,21 +195,21 @@ cc_library(
"transforms/passes.h",
"utils/attribute_utils.h",
"//tensorflow/compiler/mlir/lite/quantization:quantization_traits.h",
"@local_config_mlir//:include/mlir/Transforms/InliningUtils.h",
"@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h",
],
deps = [
":tensorflow_lite_ops_inc_gen",
":validators",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/lite/schema:schema_fbs",
"@llvm//:support",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:Dialect",
"@local_config_mlir//:IR",
"@local_config_mlir//:Pass",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:Dialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
],
alwayslink = 1,
)
@ -214,10 +226,10 @@ cc_library(
deps = [
":tensorflow_lite",
"//tensorflow/compiler/mlir/tensorflow",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
],
)
@ -231,9 +243,9 @@ cc_library(
],
deps = [
":tensorflow_lite",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:StandardOps",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
],
)
@ -246,10 +258,10 @@ tf_cc_test(
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
],
)
@ -282,17 +294,22 @@ cc_library(
":validators",
"//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/compiler/mlir/tensorflow:mangling_util",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/platform:logging",
"@com_google_absl//absl/memory",
"@llvm//:support",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:IR",
"@local_config_mlir//:Pass",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@local_config_mlir//:Transforms",
"@llvm-project//llvm:support",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:Transforms",
],
alwayslink = 1,
)
@ -310,12 +327,12 @@ cc_library(
":tensorflow_lite",
":validators",
"//tensorflow/compiler/mlir/tensorflow",
"@llvm//:support",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:IR",
"@local_config_mlir//:Pass",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
],
alwayslink = 1,
)
@ -323,6 +340,7 @@ cc_library(
cc_library(
name = "tensorflow_lite_quantize",
srcs = [
"transforms/default_quant_params.cc",
"transforms/generated_post_quantize.inc",
"transforms/generated_quantize.inc",
"transforms/load_quantization_recipe.cc",
@ -341,13 +359,13 @@ cc_library(
"//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
"//tensorflow/core:protos_all_cc",
"@com_google_absl//absl/memory",
"@llvm//:support",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:IR",
"@local_config_mlir//:Pass",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
],
alwayslink = 1,
)
@ -369,7 +387,7 @@ genrule(
"utils/generated_op_quant_spec_getters.inc",
],
cmd = ("$(location //tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen) " +
"-I external/local_config_mlir/include " +
"-I external/llvm-project/mlir/include " +
"-I external/org_tensorflow " +
"$(location //tensorflow/compiler/mlir/lite:ir/tfl_ops.td) " + " -o $@"),
tools = ["//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen"],
@ -383,7 +401,7 @@ cc_library(
],
deps = [
":tensorflow_lite",
"@local_config_mlir//:IR",
"@llvm-project//mlir:IR",
],
alwayslink = 1,
)
@ -394,9 +412,9 @@ tf_native_cc_binary(
"operator_converter_gen.cc",
],
deps = [
"@llvm//:support",
"@llvm//:tablegen",
"@local_config_mlir//:TableGen",
"@llvm-project//llvm:support",
"@llvm-project//llvm:tablegen",
"@llvm-project//mlir:TableGen",
],
)
@ -429,12 +447,17 @@ cc_library(
deps = [
":tensorflow_lite",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/core/platform:errors",
"//tensorflow/core/platform:status",
"//tensorflow/lite/kernels/internal:kernel_utils",
"//tensorflow/lite/schema:schema_fbs",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/strings",
"@flatbuffers",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:TransformUtils",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:TransformUtils",
],
)
@ -457,7 +480,7 @@ cc_library(
],
deps = [
"//tensorflow/lite/core/api",
"@local_config_mlir//:IR",
"@llvm-project//mlir:IR",
],
)
@ -494,6 +517,7 @@ cc_library(
"//tensorflow/lite:schema_fbs_version",
"//tensorflow/lite:string_util",
"//tensorflow/lite/delegates/flex:whitelisted_flex_ops_lib",
"//tensorflow/lite/kernels/internal:kernel_utils",
"//tensorflow/lite/schema:schema_fbs",
"//tensorflow/lite/tools/versioning:op_version",
"@com_google_absl//absl/base",
@ -502,14 +526,14 @@ cc_library(
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/strings",
"@flatbuffers",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:QuantOpsDialectRegistration",
"@local_config_mlir//:StandardDialectRegistration",
"@local_config_mlir//:StandardOps",
"@local_config_mlir//:Support",
"@local_config_mlir//:Translation",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:QuantOpsDialectRegistration",
"@llvm-project//mlir:StandardDialectRegistration",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:Translation",
],
alwayslink = 1,
)
@ -518,7 +542,7 @@ tf_cc_binary(
name = "flatbuffer_translate",
deps = [
":flatbuffer_translate_lib",
"@local_config_mlir//:MlirTranslateMain",
"@llvm-project//mlir:MlirTranslateMain",
],
)
@ -531,7 +555,7 @@ cc_library(
"tf_tfl_translate_cl.h",
],
deps = [
"@llvm//:support",
"@llvm-project//llvm:support",
],
alwayslink = 1,
)
@ -543,7 +567,7 @@ cc_library(
],
deps = [
"//tensorflow/compiler/mlir/lite/quantization:quantization_config",
"@llvm//:support",
"@llvm-project//llvm:support",
],
)
@ -571,9 +595,9 @@ tf_cc_binary(
"//tensorflow/lite/schema:schema_fbs",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/strings",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Support",
],
)
@ -584,16 +608,15 @@ tf_cc_binary(
":flatbuffer_translate_lib",
"//tensorflow/core:lib",
"//tensorflow/core/platform:logging",
"//tensorflow/core/platform/default/build_config:base",
"//tensorflow/lite:framework",
"//tensorflow/lite/delegates/flex:delegate",
"//tensorflow/lite/kernels:builtin_ops",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/strings",
"@llvm//:support",
"@local_config_mlir//:IR",
"@local_config_mlir//:Parser",
"@local_config_mlir//:Support",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Parser",
"@llvm-project//mlir:Support",
],
)
@ -616,12 +639,12 @@ cc_library(
"//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
"//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass",
"//tensorflow/compiler/mlir/tensorflow:translate_lib",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:IR",
"@local_config_mlir//:Pass",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:QuantOpsDialectRegistration",
"@local_config_mlir//:Transforms",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:QuantOpsDialectRegistration",
"@llvm-project//mlir:Transforms",
],
)
@ -648,15 +671,15 @@ cc_library(
"//tensorflow/core:protos_all_cc",
"//tensorflow/lite/tools/optimize:quantize_weights",
"//tensorflow/stream_executor/lib",
"@llvm//:support",
"@local_config_mlir//:Analysis",
"@local_config_mlir//:IR",
"@local_config_mlir//:Parser",
"@local_config_mlir//:Pass",
"@local_config_mlir//:QuantOps",
"@local_config_mlir//:QuantOpsDialectRegistration",
"@local_config_mlir//:Support",
"@local_config_mlir//:Transforms",
"@llvm-project//llvm:support",
"@llvm-project//mlir:Analysis",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Parser",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:QuantOpsDialectRegistration",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:Transforms",
],
)

Some files were not shown because too many files have changed in this diff.